/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *
 */
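/*
 * Example invocations (hypothetical slot, paths, and values, shown only
 * to illustrate the option syntax above):
 *
 *  bhyve ... -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=8,qsz=1024,\
 *      ioslots=16,sectsz=512,ser=NVME0001 vmname
 *
 *  bhyve ... -s 4,nvme,ram=1024 vmname    (1 GiB RAM-backed namespace)
 */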

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(params) if (nvme_debug) printf params
#define WPRINTF(params) printf params

/* defaults; can be overridden */
#define NVME_MSIX_BAR           4

#define NVME_IOSLOTS            8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN     (1 << 14)

#define NVME_QUEUES             16
#define NVME_MAX_QENTRIES       2048

#define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS      512

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)         ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)         ((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
        ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
         ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
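/*
 * Example: with num_squeues = 4 and num_cqueues = 4 (one-based counts),
 * the macro above yields 0x00030003: the zero-based submission queue
 * count in bits 15:0 and the zero-based completion queue count in
 * bits 31:16, as the Number of Queues feature expects.
 */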

#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
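/*
 * Doorbell layout sketch (per NVMe 1.3, assuming CAP.DSTRD = 0): the tail
 * doorbell for submission queue y sits at offset 0x1000 + (2 * y) * 4 and
 * the head doorbell for completion queue y at 0x1000 + (2 * y + 1) * 4,
 * so the Admin SQ tail doorbell lands at 0x1000 and the Admin CQ head
 * doorbell at 0x1004.
 */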
110
111 enum nvme_controller_register_offsets {
112         NVME_CR_CAP_LOW = 0x00,
113         NVME_CR_CAP_HI  = 0x04,
114         NVME_CR_VS      = 0x08,
115         NVME_CR_INTMS   = 0x0c,
116         NVME_CR_INTMC   = 0x10,
117         NVME_CR_CC      = 0x14,
118         NVME_CR_CSTS    = 0x1c,
119         NVME_CR_NSSR    = 0x20,
120         NVME_CR_AQA     = 0x24,
121         NVME_CR_ASQ_LOW = 0x28,
122         NVME_CR_ASQ_HI  = 0x2c,
123         NVME_CR_ACQ_LOW = 0x30,
124         NVME_CR_ACQ_HI  = 0x34,
125 };
126
127 enum nvme_cmd_cdw11 {
128         NVME_CMD_CDW11_PC  = 0x0001,
129         NVME_CMD_CDW11_IEN = 0x0002,
130         NVME_CMD_CDW11_IV  = 0xFFFF0000,
131 };
132
133 #define NVME_CQ_INTEN   0x01
134 #define NVME_CQ_INTCOAL 0x02
135
136 struct nvme_completion_queue {
137         struct nvme_completion *qbase;
138         uint32_t        size;
139         uint16_t        tail; /* nvme progress */
140         uint16_t        head; /* guest progress */
141         uint16_t        intr_vec;
142         uint32_t        intr_en;
143         pthread_mutex_t mtx;
144 };
145
146 struct nvme_submission_queue {
147         struct nvme_command *qbase;
148         uint32_t        size;
149         uint16_t        head; /* nvme progress */
150         uint16_t        tail; /* guest progress */
151         uint16_t        cqid; /* completion queue id */
152         int             busy; /* queue is being processed */
153         int             qpriority;
154 };
155
156 enum nvme_storage_type {
157         NVME_STOR_BLOCKIF = 0,
158         NVME_STOR_RAM = 1,
159 };
160
161 struct pci_nvme_blockstore {
162         enum nvme_storage_type type;
163         void            *ctx;
164         uint64_t        size;
165         uint32_t        sectsz;
166         uint32_t        sectsz_bits;
167 };
168
169 struct pci_nvme_ioreq {
170         struct pci_nvme_softc *sc;
171         struct pci_nvme_ioreq *next;
172         struct nvme_submission_queue *nvme_sq;
173         uint16_t        sqid;
174
175         /* command information */
176         uint16_t        opc;
177         uint16_t        cid;
178         uint32_t        nsid;
179
180         uint64_t        prev_gpaddr;
181         size_t          prev_size;
182
183         /*
184          * lock if all iovs consumed (big IO);
185          * complete transaction before continuing
186          */
187         pthread_mutex_t mtx;
188         pthread_cond_t  cv;
189
190         struct blockif_req io_req;
191
192         /* pad to fit up to 512 page descriptors from guest IO request */
193         struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
194 };
195
196 struct pci_nvme_softc {
197         struct pci_devinst *nsc_pi;
198
199         pthread_mutex_t mtx;
200
201         struct nvme_registers regs;
202
203         struct nvme_namespace_data  nsdata;
204         struct nvme_controller_data ctrldata;
205         struct nvme_error_information_entry err_log;
206         struct nvme_health_information_page health_log;
207         struct nvme_firmware_page fw_log;
208
209         struct pci_nvme_blockstore nvstore;
210
211         uint16_t        max_qentries;   /* max entries per queue */
212         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
213         uint32_t        num_cqueues;
214         uint32_t        num_squeues;
215
216         struct pci_nvme_ioreq *ioreqs;
217         struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
218         uint32_t        pending_ios;
219         uint32_t        ioslots;
220         sem_t           iosemlock;
221
222         /*
223          * Memory mapped Submission and Completion queues
224          * Each array includes both Admin and IO queues
225          */
226         struct nvme_completion_queue *compl_queues;
227         struct nvme_submission_queue *submit_queues;
228
229         /* controller features */
230         uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
231         uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
232         uint32_t        async_ev_config;         /* 0x0B: async event config */
233 };
234
235
236 static void pci_nvme_io_partial(struct blockif_req *br, int err);
237
238 /* Controller Configuration utils */
239 #define NVME_CC_GET_EN(cc) \
240         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
241 #define NVME_CC_GET_CSS(cc) \
242         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
243 #define NVME_CC_GET_SHN(cc) \
244         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
245 #define NVME_CC_GET_IOSQES(cc) \
246         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
247 #define NVME_CC_GET_IOCQES(cc) \
248         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
249
250 #define NVME_CC_WRITE_MASK \
251         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
252          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
253          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
254
255 #define NVME_CC_NEN_WRITE_MASK \
256         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
257          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
258          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
259
260 /* Controller Status utils */
261 #define NVME_CSTS_GET_RDY(sts) \
262         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
263
264 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
265
266 /* Completion Queue status word utils */
267 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
268 #define NVME_STATUS_MASK \
269         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
270          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
271
272 static __inline void
273 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
274 {
275         size_t len;
276
277         len = strnlen(src, dst_size);
278         memset(dst, pad, dst_size);
279         memcpy(dst, src, len);
280 }
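/*
 * Example: cpywithpad(dst, 8, "1.0", ' ') fills dst with "1.0     "
 * (space padded, no NUL terminator), matching the padded ASCII fields
 * NVMe identify data uses for model number and firmware revision.
 */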
281
282 static __inline void
283 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
284 {
285
286         *status &= ~NVME_STATUS_MASK;
287         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
288                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
289 }
290
291 static __inline void
292 pci_nvme_status_genc(uint16_t *status, uint16_t code)
293 {
294
295         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
296 }
297
298 static __inline void
299 pci_nvme_toggle_phase(uint16_t *status, int prev)
300 {
301
302         if (prev)
303                 *status &= ~NVME_STATUS_P;
304         else
305                 *status |= NVME_STATUS_P;
306 }
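/*
 * The Phase Tag lets the guest tell new completion entries from stale
 * ones: each entry is posted with the opposite of the phase currently
 * stored in that slot. For example, on the first pass through a zeroed
 * completion queue entries are posted with P = 1; once the tail wraps,
 * the next pass posts them with P = 0.
 */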
307
308 static void
309 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
310 {
311         struct nvme_controller_data *cd = &sc->ctrldata;
312
313         cd->vid = 0xFB5D;
314         cd->ssvid = 0x0000;
315
316         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
317         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
318
319         /* Num of submission commands that we can handle at a time (2^rab) */
320         cd->rab   = 4;
321
322         /* FreeBSD OUI */
323         cd->ieee[0] = 0x58;
324         cd->ieee[1] = 0x9c;
325         cd->ieee[2] = 0xfc;
326
327         cd->mic = 0;
328
329         cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
330
331         cd->ver = 0x00010300;
332
333         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
334         cd->acl = 2;
335         cd->aerl = 4;
336
337         cd->lpa = 0;    /* TODO: support some simple things like SMART */
338         cd->elpe = 0;   /* max error log page entries */
        cd->npss = 1;   /* number of power states supported */

        /* Warning Composite Temperature Threshold */
        cd->wctemp = 0x0157;

        cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
            (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
        cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
            (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
        cd->nn = 1;     /* number of namespaces */

        cd->fna = 0x03;

        cd->power_state[0].mp = 10;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
{
        struct nvme_namespace_data *nd;

        nd = &sc->nsdata;

        nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
        nd->ncap = nd->nsze;
        nd->nuse = nd->nsze;

        /* Get LBA and backstore information from backing store */
        nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
        /* LBA data-sz = 2^lbads */
        nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;

        nd->flbas = 0;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

        memset(&sc->err_log, 0, sizeof(sc->err_log));
        memset(&sc->health_log, 0, sizeof(sc->health_log));
        memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
        DPRINTF(("%s\r\n", __func__));

        sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
            (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
            (60 << NVME_CAP_LO_REG_TO_SHIFT);

        sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

        sc->regs.vs = 0x00010300;       /* NVMe v1.3 */

        sc->regs.cc = 0;
        sc->regs.csts = 0;

        sc->num_cqueues = sc->num_squeues = sc->max_queues;
        if (sc->submit_queues != NULL) {
                for (int i = 0; i < sc->num_squeues + 1; i++) {
                        /*
                         * The Admin Submission Queue is at index 0.
                         * It must not be changed at reset otherwise the
                         * emulation will be out of sync with the guest.
                         */
                        if (i != 0) {
                                sc->submit_queues[i].qbase = NULL;
                                sc->submit_queues[i].size = 0;
                                sc->submit_queues[i].cqid = 0;
                        }
                        sc->submit_queues[i].tail = 0;
                        sc->submit_queues[i].head = 0;
                        sc->submit_queues[i].busy = 0;
                }
        } else
                sc->submit_queues = calloc(sc->num_squeues + 1,
                                        sizeof(struct nvme_submission_queue));

        if (sc->compl_queues != NULL) {
                for (int i = 0; i < sc->num_cqueues + 1; i++) {
                        /* See Admin Submission Queue note above */
                        if (i != 0) {
                                sc->compl_queues[i].qbase = NULL;
                                sc->compl_queues[i].size = 0;
                        }

                        sc->compl_queues[i].tail = 0;
                        sc->compl_queues[i].head = 0;
                }
        } else {
                sc->compl_queues = calloc(sc->num_cqueues + 1,
                                        sizeof(struct nvme_completion_queue));

                for (int i = 0; i < sc->num_cqueues + 1; i++)
                        pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
        }
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
        pthread_mutex_lock(&sc->mtx);
        pci_nvme_reset_locked(sc);
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
        uint16_t acqs, asqs;

        DPRINTF(("%s\r\n", __func__));

        asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
        sc->submit_queues[0].size = asqs;
        sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
                    sizeof(struct nvme_command) * asqs);

        DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.asq, sc->submit_queues[0].qbase));

        acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
            NVME_AQA_REG_ACQS_MASK) + 1;
        sc->compl_queues[0].size = acqs;
        sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
                 sizeof(struct nvme_completion) * acqs);
        DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}
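/*
 * Example: a guest AQA write of 0x001f001f decodes to ACQS = 0x1f
 * (bits 27:16) and ASQS = 0x1f (bits 11:0). Both fields are zero-based,
 * so the +1 above sizes the Admin SQ and Admin CQ at 32 entries each.
 */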

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
        size_t len)
{
        uint8_t *dst;
        size_t bytes;

        if (len > (8 * 1024)) {
                return (-1);
        }

        /* Copy from the start of prp1 to the end of the physical page */
        bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
        bytes = MIN(bytes, len);

        dst = vm_map_gpa(ctx, prp1, bytes);
        if (dst == NULL) {
                return (-1);
        }

        memcpy(dst, src, bytes);

        src += bytes;

        len -= bytes;
        if (len == 0) {
                return (0);
        }

        len = MIN(len, PAGE_SIZE);

        dst = vm_map_gpa(ctx, prp2, len);
        if (dst == NULL) {
                return (-1);
        }

        memcpy(dst, src, len);

        return (0);
}
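/*
 * Worked example: with prp1 = 0x10f00 and len = 512, the first memcpy
 * covers PAGE_SIZE - 0xf00 = 256 bytes (through the end of the page prp1
 * points into) and the remaining 256 bytes land at the start of the page
 * addressed by prp2. Transfers larger than 8 KiB are rejected above and
 * the remainder is clamped to a single page, so prp2 is always treated
 * as a page pointer here, never as a PRP list.
 */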

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_squeues) {
                WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
                        __func__, qid, sc->num_squeues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->submit_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_submission_queue *nsq;

                if ((qid == 0) || (qid > sc->num_squeues)) {
                        WPRINTF(("%s queue index %u > num_squeues %u\r\n",
                                __func__, qid, sc->num_squeues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                nsq = &sc->submit_queues[qid];
                nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                              sizeof(struct nvme_command) * (size_t)nsq->size);
                nsq->cqid = (command->cdw11 >> 16) & 0xffff;
                nsq->qpriority = (command->cdw11 >> 1) & 0x03;

                DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
                        qid, nsq->size, nsq->qbase, nsq->cqid));

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

                DPRINTF(("%s completed creating IOSQ qid %u\r\n",
                         __func__, qid));
        } else {
                /*
                 * Guest sent non-contig submission queue request.
                 * This setting is unsupported by this emulation.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o submission queue\r\n", __func__));

                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        }
        return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_cqueues) {
                WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
                        __func__, qid, sc->num_cqueues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->compl_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_completion_queue *ncq;

                if ((qid == 0) || (qid > sc->num_cqueues)) {
                        WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
                                __func__, qid, sc->num_cqueues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                ncq = &sc->compl_queues[qid];
                ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
                ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
                ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
                             command->prp1,
                             sizeof(struct nvme_completion) * (size_t)ncq->size);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        } else {
                /*
                 * Non-contig completion queue unsupported.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o completion queue\r\n",
                         __func__));

                /* 0x12 = Invalid Use of Controller Memory Buffer */
                pci_nvme_status_genc(&compl->status, 0x12);
        }

        return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        /* NUMD is a zero-based count of dwords; convert to bytes */
        uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;
        uint8_t logpage = command->cdw10 & 0xFF;

        DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        /* Never copy more than the log page structure actually holds */
        switch (logpage) {
        case NVME_LOG_ERROR:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->err_log,
                    MIN(logsize, sizeof(sc->err_log)));
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                /* TODO: present some smart info */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->health_log,
                    MIN(logsize, sizeof(sc->health_log)));
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->fw_log,
                    MIN(logsize, sizeof(sc->fw_log)));
                break;
        default:
                WPRINTF(("%s get log page %x command not supported\r\n",
                        __func__, logpage));

                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_LOG_PAGE);
        }

        return (1);
}
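/*
 * Example: a guest reading the 512-byte SMART / Health log sets NUMD to
 * 127 (zero-based dwords), so logsize above computes (1 + 127) * 4 = 512
 * bytes, which nvme_prp_memcpy() then copies across at most two pages.
 */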

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        void *dest;

        DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
                command->cdw10 & 0xFF, command->nsid));

        switch (command->cdw10 & 0xFF) {
        case 0x00: /* return Identify Namespace data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
                break;
        case 0x01: /* return Identify Controller data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->ctrldata,
                    sizeof(sc->ctrldata));
                break;
        case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                ((uint32_t *)dest)[0] = 1;
                ((uint32_t *)dest)[1] = 0;
                break;
        case 0x11:
                pci_nvme_status_genc(&compl->status,
                    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
                return (1);
        case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        case 0x10:
        case 0x12:
        case 0x13:
        case 0x14:
        case 0x15:
        default:
                DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
                         __func__, command->cdw10 & 0xFF));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t nqr;   /* Number of Queues Requested */

        nqr = command->cdw11 & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_squeues = ONE_BASED(nqr);
        if (sc->num_squeues > sc->max_queues) {
                DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
                                        sc->max_queues));
                sc->num_squeues = sc->max_queues;
        }

        nqr = (command->cdw11 >> 16) & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_cqueues = ONE_BASED(nqr);
        if (sc->num_cqueues > sc->max_queues) {
                DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
                                        sc->max_queues));
                sc->num_cqueues = sc->max_queues;
        }

        compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

        return (0);
}

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;
        uint32_t iv;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                nvme_set_feature_queues(sc, command, compl);
                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));

                /* in uS */
                sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

                sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                iv = command->cdw11 & 0xFFFF;

                DPRINTF(("  interrupt vector configuration 0x%x\r\n",
                        command->cdw11));

                for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
                        if (sc->compl_queues[i].intr_vec == iv) {
                                if (command->cdw11 & (1 << 16))
                                        sc->compl_queues[i].intr_en |=
                                                              NVME_CQ_INTCOAL;
                                else
                                        sc->compl_queues[i].intr_en &=
                                                             ~NVME_CQ_INTCOAL;
                        }
                }
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration 0x%x\r\n",
                        command->cdw11));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker 0x%x\r\n",
                        command->cdw11));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition 0x%x\r\n",
                        command->cdw11));
                break;
        default:
                WPRINTF(("%s invalid feature\r\n", __func__));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));

        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration\r\n"));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management\r\n"));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range\r\n"));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold\r\n"));
                switch ((command->cdw11 >> 20) & 0x3) {
                case 0:
                        /* Over temp threshold */
                        compl->cdw0 = 0xFFFF;
                        break;
                case 1:
                        /* Under temp threshold */
                        compl->cdw0 = 0;
                        break;
                default:
                        WPRINTF(("  invalid threshold type select\r\n"));
                        pci_nvme_status_genc(&compl->status,
                            NVME_SC_INVALID_FIELD);
                        return (1);
                }
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery\r\n"));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache\r\n"));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

                DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
                        compl->cdw0 & 0xFFFF,
                        (compl->cdw0 >> 16) & 0xFFFF));

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing\r\n"));
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                DPRINTF(("  interrupt vector configuration\r\n"));
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity\r\n"));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration\r\n"));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker\r\n"));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition\r\n"));
                break;
        default:
                WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
                command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

        /* TODO: search for the command ID and abort it */

        compl->cdw0 = 1;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
        struct nvme_command* command, struct nvme_completion* compl)
{
        DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));

        /*
         * TODO: raise events when they happen based on the Set Features cmd.
         * These events happen async, so only set completion successful if
         * there is an event reflective of the request to get event.
         */
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
        struct nvme_completion compl;
        struct nvme_command *cmd;
        struct nvme_submission_queue *sq;
        struct nvme_completion_queue *cq;
        int do_intr = 0;
        uint16_t sqhead;

        DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));

        sq = &sc->submit_queues[0];

        sqhead = atomic_load_acq_short(&sq->head);

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
                        __func__, sqhead, sq->tail));
                return;
        }

        DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                cmd = &(sq->qbase)[sqhead];
                compl.status = 0;

                switch (cmd->opc) {
                case NVME_OPC_DELETE_IO_SQ:
                        DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_SQ:
                        DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_DELETE_IO_CQ:
                        DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_CQ:
                        DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_LOG_PAGE:
                        DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
                        do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
                        break;
                case NVME_OPC_IDENTIFY:
                        DPRINTF(("%s command IDENTIFY\r\n", __func__));
                        do_intr |= nvme_opc_identify(sc, cmd, &compl);
                        break;
                case NVME_OPC_ABORT:
                        DPRINTF(("%s command ABORT\r\n", __func__));
                        do_intr |= nvme_opc_abort(sc, cmd, &compl);
                        break;
                case NVME_OPC_SET_FEATURES:
                        DPRINTF(("%s command SET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_set_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_FEATURES:
                        DPRINTF(("%s command GET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_get_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_ASYNC_EVENT_REQUEST:
                        DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
                        /* XXX dont care, unhandled for now
                        do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
                        */
                        break;
                default:
                        WPRINTF(("0x%x command is not implemented\r\n",
                            cmd->opc));
                }

                /* for now skip async event generation */
                if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
                        struct nvme_completion *cp;
                        int phase;

                        cq = &sc->compl_queues[0];

                        cp = &(cq->qbase)[cq->tail];
                        cp->cdw0 = compl.cdw0;
                        cp->sqid = 0;
                        cp->sqhd = sqhead;
                        cp->cid = cmd->cid;

                        phase = NVME_STATUS_GET_P(cp->status);
                        cp->status = compl.status;
                        pci_nvme_toggle_phase(&cp->status, phase);

                        cq->tail = (cq->tail + 1) % cq->size;
                }
                sqhead = (sqhead + 1) % sq->size;
        }

        DPRINTF(("setting sqhead %u\r\n", sqhead));
        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);

        if (do_intr)
                pci_generate_msix(sc->nsc_pi, 0);

}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
        uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
        int iovidx;

        if (req != NULL) {
                /* concatenate contig block-iovs to minimize number of iovs */
                if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
                        iovidx = req->io_req.br_iovcnt - 1;

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             req->prev_gpaddr, size);

                        req->prev_size += size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
                } else {
                        pthread_mutex_lock(&req->mtx);

                        iovidx = req->io_req.br_iovcnt;
                        if (iovidx == NVME_MAX_BLOCKIOVS) {
                                int err = 0;

                                DPRINTF(("large I/O, doing partial req\r\n"));

                                iovidx = 0;
                                req->io_req.br_iovcnt = 0;

                                req->io_req.br_callback = pci_nvme_io_partial;

                                if (!do_write)
                                        err = blockif_read(sc->nvstore.ctx,
                                                           &req->io_req);
                                else
                                        err = blockif_write(sc->nvstore.ctx,
                                                            &req->io_req);

                                /* wait until req completes before cont */
                                if (err == 0)
                                        pthread_cond_wait(&req->cv, &req->mtx);
                        }
                        if (iovidx == 0) {
                                req->io_req.br_offset = lba;
                                req->io_req.br_resid = 0;
                                req->io_req.br_param = req;
                        }

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             gpaddr, size);

                        req->io_req.br_iov[iovidx].iov_len = size;

                        req->prev_gpaddr = gpaddr;
                        req->prev_size = size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iovcnt++;

                        pthread_mutex_unlock(&req->mtx);
                }
        } else {
                /* RAM buffer: read/write directly */
                void *p = sc->nvstore.ctx;
                void *gptr;

                if ((lba + size) > sc->nvstore.size) {
                        WPRINTF(("%s write would overflow RAM\r\n", __func__));
                        return (-1);
                }

                p = (void *)((uintptr_t)p + (uintptr_t)lba);
                gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
                if (do_write)
                        memcpy(p, gptr, size);
                else
                        memcpy(gptr, p, size);
        }
        return (0);
}
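/*
 * Example: two PRP entries covering guest-contiguous pages, say 0x20000
 * followed by 0x21000, arrive as separate 4 KiB appends; the first branch
 * above notices prev_gpaddr + prev_size == gpaddr and grows the existing
 * iovec to 8 KiB instead of consuming another of the NVME_MAX_BLOCKIOVS
 * slots.
 */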

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
        struct nvme_submission_queue *sq, int sqid, uint16_t cid,
        uint32_t cdw0, uint16_t status, int ignore_busy)
{
        struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
        struct nvme_completion *compl;
        int do_intr = 0;
        int phase;

        DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
                 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
                 NVME_STATUS_GET_SC(status)));

        pthread_mutex_lock(&cq->mtx);

        assert(cq->qbase != NULL);

        compl = &cq->qbase[cq->tail];

        compl->sqhd = atomic_load_acq_short(&sq->head);
        compl->sqid = sqid;
        compl->cid = cid;

        /* toggle phase */
        phase = NVME_STATUS_GET_P(compl->status);
        compl->status = status;
        pci_nvme_toggle_phase(&compl->status, phase);

        cq->tail = (cq->tail + 1) % cq->size;

        if (cq->intr_en & NVME_CQ_INTEN)
                do_intr = 1;

        pthread_mutex_unlock(&cq->mtx);

        if (ignore_busy || !atomic_load_acq_int(&sq->busy))
                if (do_intr)
                        pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
        req->sc = NULL;
        req->nvme_sq = NULL;
        req->sqid = 0;

        pthread_mutex_lock(&sc->mtx);

        req->next = sc->ioreqs_free;
        sc->ioreqs_free = req;
        sc->pending_ios--;

        /* when no more IO pending, can set to ready if device reset/enabled */
        if (sc->pending_ios == 0 &&
            NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
                sc->regs.csts |= NVME_CSTS_RDY;

        pthread_mutex_unlock(&sc->mtx);

        sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
        struct pci_nvme_ioreq *req = NULL;
1214
1215         sem_wait(&sc->iosemlock);
1216         pthread_mutex_lock(&sc->mtx);
1217
1218         req = sc->ioreqs_free;
1219         assert(req != NULL);
1220
1221         sc->ioreqs_free = req->next;
1222
1223         req->next = NULL;
1224         req->sc = sc;
1225
1226         sc->pending_ios++;
1227
1228         pthread_mutex_unlock(&sc->mtx);
1229
1230         req->io_req.br_iovcnt = 0;
1231         req->io_req.br_offset = 0;
1232         req->io_req.br_resid = 0;
1233         req->io_req.br_param = req;
1234         req->prev_gpaddr = 0;
1235         req->prev_size = 0;
1236
1237         return req;
1238 }
1239
1240 static void
1241 pci_nvme_io_done(struct blockif_req *br, int err)
1242 {
1243         struct pci_nvme_ioreq *req = br->br_param;
1244         struct nvme_submission_queue *sq = req->nvme_sq;
1245         uint16_t code, status;
1246
1247         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1248         
1249         /* TODO return correct error */
1250         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1251         pci_nvme_status_genc(&status, code);
1252
1253         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1254         pci_nvme_release_ioreq(req->sc, req);
1255 }
1256
1257 static void
1258 pci_nvme_io_partial(struct blockif_req *br, int err)
1259 {
1260         struct pci_nvme_ioreq *req = br->br_param;
1261
1262         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1263
1264         pthread_cond_signal(&req->cv);
1265 }
1266
1267
1268 static void
1269 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1270 {
1271         struct nvme_submission_queue *sq;
1272         uint16_t status;
1273         uint16_t sqhead;
1274         int err;
1275
1276         /* handle all submissions up to sq->tail index */
1277         sq = &sc->submit_queues[idx];
1278
1279         if (atomic_testandset_int(&sq->busy, 1)) {
1280                 DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1281                 return;
1282         }
1283
1284         sqhead = atomic_load_acq_short(&sq->head);
1285
1286         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1287                  idx, sqhead, sq->tail, sq->qbase));
1288
1289         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1290                 struct nvme_command *cmd;
1291                 struct pci_nvme_ioreq *req = NULL;
1292                 uint64_t lba;
1293                 uint64_t nblocks, bytes, size, cpsz;
1294
1295                 /* TODO: support scatter gather list handling */
1296
1297                 cmd = &sq->qbase[sqhead];
1298                 sqhead = (sqhead + 1) % sq->size;
1299
1300                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1301
1302                 if (cmd->opc == NVME_OPC_FLUSH) {
1303                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1304                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1305                                                 status, 1);
1306
1307                         continue;
1308                 } else if (cmd->opc == 0x08) {
1309                         /* TODO: write zeroes */
1310                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1311                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1312                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1313                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1314                                                 status, 1);
1315
1316                         continue;
1317                 }
1318
1319                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1320
1321                 bytes = nblocks * sc->nvstore.sectsz;
1322
1323                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1324                         req = pci_nvme_get_ioreq(sc);
1325                         req->nvme_sq = sq;
1326                         req->sqid = idx;
1327                 }
1328
1329                 /*
1330                  * If data starts mid-page and flows into the next page, then
1331                  * increase page count
1332                  */
1333
1334                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1335                          "(%lu-bytes)\r\n",
1336                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1337                          cmd->opc == NVME_OPC_WRITE ?
1338                              "WRITE" : "READ",
1339                          lba, nblocks, bytes));
1340
1341                 cmd->prp1 &= ~(0x03UL);
1342                 cmd->prp2 &= ~(0x03UL);
1343
1344                 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1345
1346                 size = bytes;
1347                 lba *= sc->nvstore.sectsz;
1348
1349                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1350
1351                 if (cpsz > bytes)
1352                         cpsz = bytes;
1353
1354                 if (req != NULL) {
1355                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1356                                                 cmd->cdw10;
1357                         req->opc = cmd->opc;
1358                         req->cid = cmd->cid;
1359                         req->nsid = cmd->nsid;
1360                 }
1361
1362                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1363                     cmd->opc == NVME_OPC_WRITE, lba);
1364                 lba += cpsz;
1365                 size -= cpsz;
1366
1367                 if (size == 0)
1368                         goto iodone;
1369
1370                 if (size <= PAGE_SIZE) {
1371                         /* prp2 is second (and final) page in transfer */
1372
1373                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
                            size,
                            cmd->opc == NVME_OPC_WRITE,
                            lba);
                } else {
                        uint64_t *prp_list;
                        int i;

                        /* prp2 is pointer to a physical region page list */
                        prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
                                                    cmd->prp2, PAGE_SIZE);

                        i = 0;
                        while (size != 0) {
                                cpsz = MIN(size, PAGE_SIZE);

                                /*
                                 * Move to linked physical region page list
                                 * in last item.
                                 */
                                if (i == (NVME_PRP2_ITEMS-1) &&
                                    size > PAGE_SIZE) {
                                        assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
                                        prp_list = paddr_guest2host(
                                                      sc->nsc_pi->pi_vmctx,
                                                      prp_list[i], PAGE_SIZE);
                                        i = 0;
                                }
                                if (prp_list[i] == 0) {
                                        WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
                                        err = 1;
                                        break;
                                }

                                err = pci_nvme_append_iov_req(sc, req,
                                    prp_list[i], cpsz,
                                    cmd->opc == NVME_OPC_WRITE, lba);
                                if (err)
                                        break;

                                lba += cpsz;
                                size -= cpsz;
                                i++;
                        }
                }

iodone:
                if (sc->nvstore.type == NVME_STOR_RAM) {
                        uint16_t code, status;

                        code = err ? NVME_SC_LBA_OUT_OF_RANGE :
                            NVME_SC_SUCCESS;
                        pci_nvme_status_genc(&status, code);

                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);

                        continue;
                }

                if (err)
                        goto do_error;

                req->io_req.br_callback = pci_nvme_io_done;

                err = 0;
                switch (cmd->opc) {
                case NVME_OPC_READ:
                        err = blockif_read(sc->nvstore.ctx, &req->io_req);
                        break;
                case NVME_OPC_WRITE:
                        err = blockif_write(sc->nvstore.ctx, &req->io_req);
                        break;
                default:
                        WPRINTF(("%s unhandled io command 0x%x\r\n",
                                 __func__, cmd->opc));
                        err = 1;
                }

do_error:
                if (err) {
                        uint16_t status;

                        pci_nvme_status_genc(&status,
                            NVME_SC_DATA_TRANSFER_ERROR);

                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);
                        pci_nvme_release_ioreq(sc, req);
                }
        }

        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);
}
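
/*
 * PRP accounting, for reference (illustrative numbers, nothing below is
 * executed): with 4096-byte pages a PRP list page holds NVME_PRP2_ITEMS
 * (512) eight-byte entries, so after PRP1 covers the first page a single
 * list page maps up to 2 MiB of remaining transfer.  Only a larger
 * residual triggers the chaining branch above, which reuses the final
 * slot of a full list page as the guest-physical address of the next
 * list page.
 */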

static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
        uint64_t idx, int is_sq, uint64_t value)
{
        DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
                idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));

        if (is_sq) {
                atomic_store_short(&sc->submit_queues[idx].tail,
                                   (uint16_t)value);

                if (idx == 0) {
                        pci_nvme_handle_admin_cmd(sc, value);
                } else {
                        /* submission queue; handle new entries in SQ */
                        if (idx > sc->num_squeues) {
                                WPRINTF(("%s SQ index %lu overflow from "
                                         "guest (max %u)\r\n",
                                         __func__, idx, sc->num_squeues));
                                return;
                        }
                        pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
                }
        } else {
                if (idx > sc->num_cqueues) {
                        WPRINTF(("%s queue index %lu overflow from "
                                 "guest (max %u)\r\n",
                                 __func__, idx, sc->num_cqueues));
                        return;
                }

                sc->compl_queues[idx].head = (uint16_t)value;
        }
}
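
/*
 * Doorbell decode example (4-byte registers packed in 8-byte SQ/CQ
 * pairs, as the "/ 8" in pci_nvme_write_bar_0() below implies): a guest
 * write at NVME_DOORBELL_OFFSET + 0x14 gives belloffset 0x14, hence
 * idx = 0x14 / 8 = 2, and 0x14 % 8 = 4 selects the CQ head doorbell of
 * queue pair 2.
 */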

static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
        const char *s = iswrite ? "WRITE" : "READ";

        switch (offset) {
        case NVME_CR_CAP_LOW:
                DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
                break;
        case NVME_CR_CAP_HI:
                DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
                break;
        case NVME_CR_VS:
                DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
                break;
        case NVME_CR_INTMS:
                DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
                break;
        case NVME_CR_INTMC:
                DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
                break;
        case NVME_CR_CC:
                DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
                break;
        case NVME_CR_CSTS:
                DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
                break;
        case NVME_CR_NSSR:
                DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
                break;
        case NVME_CR_AQA:
                DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
                break;
        case NVME_CR_ASQ_LOW:
                DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
                break;
        case NVME_CR_ASQ_HI:
                DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
                break;
        case NVME_CR_ACQ_LOW:
                DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
                break;
        case NVME_CR_ACQ_HI:
                DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
                break;
        default:
                DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
        }
}

static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
        uint64_t offset, int size, uint64_t value)
{
        uint32_t ccreg;

        if (offset >= NVME_DOORBELL_OFFSET) {
                uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
                uint64_t idx = belloffset / 8; /* 8-byte SQ tail/CQ head pairs */
                int is_sq = (belloffset % 8) < 4;

                if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
                        WPRINTF(("guest attempted an overflow write offset "
                                 "0x%lx, val 0x%lx in %s\r\n",
                                 offset, value, __func__));
                        return;
                }

                pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
                return;
        }

        DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
                offset, size, value));

        if (size != 4) {
                WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
                         "val 0x%lx) to bar0 in %s\r\n",
                         size, offset, value, __func__));
                /* TODO: shutdown device */
                return;
        }

        pci_nvme_bar0_reg_dumps(__func__, offset, 1);

        pthread_mutex_lock(&sc->mtx);

        switch (offset) {
        case NVME_CR_CAP_LOW:
        case NVME_CR_CAP_HI:
                /* readonly */
                break;
        case NVME_CR_VS:
                /* readonly */
                break;
        case NVME_CR_INTMS:
                /* MSI-X, so ignore */
                break;
        case NVME_CR_INTMC:
                /* MSI-X, so ignore */
                break;
        case NVME_CR_CC:
                ccreg = (uint32_t)value;

                DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
                         "iocqes %u\r\n",
                         __func__,
                         NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
                         NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
                         NVME_CC_GET_IOCQES(ccreg)));

                if (NVME_CC_GET_SHN(ccreg)) {
                        /* perform shutdown - flush out data to backend */
                        sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
                            NVME_CSTS_REG_SHST_SHIFT);
                        sc->regs.csts |= NVME_SHST_COMPLETE <<
                            NVME_CSTS_REG_SHST_SHIFT;
                }
                if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
                        if (NVME_CC_GET_EN(ccreg) == 0)
                                /* transition 1->0 causes controller reset */
                                pci_nvme_reset_locked(sc);
                        else
                                pci_nvme_init_controller(ctx, sc);
                }

                /* Insert the iocqes, iosqes and en bits from the write */
                sc->regs.cc &= ~NVME_CC_WRITE_MASK;
                sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
                if (NVME_CC_GET_EN(ccreg) == 0) {
                        /* Insert the ams, mps and css bit fields */
                        sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
                        sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
                        sc->regs.csts &= ~NVME_CSTS_RDY;
                } else if (sc->pending_ios == 0) {
                        sc->regs.csts |= NVME_CSTS_RDY;
                }
                break;
        case NVME_CR_CSTS:
                break;
        case NVME_CR_NSSR:
                /* ignore writes; don't support subsystem reset */
                break;
        case NVME_CR_AQA:
                sc->regs.aqa = (uint32_t)value;
                break;
        case NVME_CR_ASQ_LOW:
                sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
                               (0xFFFFF000 & value);
                break;
        case NVME_CR_ASQ_HI:
                sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
                               (value << 32);
                break;
        case NVME_CR_ACQ_LOW:
                sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
                               (0xFFFFF000 & value);
                break;
        case NVME_CR_ACQ_HI:
                sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
                               (value << 32);
                break;
        default:
                DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
                         __func__, offset, value, size));
        }
        pthread_mutex_unlock(&sc->mtx);
}
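
/*
 * Worked example for the split queue-base writes above: if the guest
 * stores 0x7f456000 to NVME_CR_ASQ_LOW and then 0x1 to NVME_CR_ASQ_HI
 * (addresses are illustrative), sc->regs.asq ends up as 0x17f456000.
 * The 0xFFFFF000 mask on the low half enforces the 4 KiB alignment the
 * spec requires for queue base addresses.
 */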

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
                int baridx, uint64_t offset, int size, uint64_t value)
{
        struct pci_nvme_softc* sc = pi->pi_arg;

        if (baridx == pci_msix_table_bar(pi) ||
            baridx == pci_msix_pba_bar(pi)) {
                DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
                         "value 0x%lx\r\n", baridx, offset, size, value));

                pci_emul_msix_twrite(pi, offset, size, value);
                return;
        }

        switch (baridx) {
        case 0:
                pci_nvme_write_bar_0(ctx, sc, offset, size, value);
                break;

        default:
                DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
                         __func__, baridx, value));
        }
}

static uint64_t
pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
{
        uint64_t value;

        pci_nvme_bar0_reg_dumps(__func__, offset, 0);

        if (offset < NVME_DOORBELL_OFFSET) {
                void *p = &(sc->regs);
                pthread_mutex_lock(&sc->mtx);
                memcpy(&value, (void *)((uintptr_t)p + offset), size);
                pthread_mutex_unlock(&sc->mtx);
        } else {
                value = 0;
                WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
        }

        switch (size) {
        case 1:
                value &= 0xFF;
                break;
        case 2:
                value &= 0xFFFF;
                break;
        case 4:
                value &= 0xFFFFFFFF;
                break;
        }

        DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
                 offset, size, (uint32_t)value));

        return (value);
}
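
/*
 * The memcpy above provides byte-granular access to the packed register
 * file: for example, a 2-byte read at offset 0x08 copies the low half of
 * the Version (VS) register, and the switch then masks the result to
 * 0xFFFF.  Reads at or beyond NVME_DOORBELL_OFFSET are flagged as
 * invalid and return 0.
 */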

static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
        struct pci_nvme_softc* sc = pi->pi_arg;

        if (baridx == pci_msix_table_bar(pi) ||
            baridx == pci_msix_pba_bar(pi)) {
                DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
                        baridx, offset, size));

                return pci_emul_msix_tread(pi, offset, size);
        }

        switch (baridx) {
        case 0:
                return pci_nvme_read_bar_0(sc, offset, size);

        default:
                DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
        }

        return (0);
}

static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
        char bident[sizeof("XX:X:X")];
        char    *uopt, *xopts, *config;
        uint32_t sectsz;
        int optidx;

        sc->max_queues = NVME_QUEUES;
        sc->max_qentries = NVME_MAX_QENTRIES;
        sc->ioslots = NVME_IOSLOTS;
        sc->num_squeues = sc->max_queues;
        sc->num_cqueues = sc->max_queues;
        sectsz = 0;

        if (opts == NULL) {
                fprintf(stderr, "backing store not specified\n");
                return (-1);
        }

        uopt = strdup(opts);
        optidx = 0;
        snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
                 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
        for (xopts = strtok(uopt, ",");
             xopts != NULL;
             xopts = strtok(NULL, ",")) {

                if ((config = strchr(xopts, '=')) != NULL)
                        *config++ = '\0';

                if (!strcmp("maxq", xopts)) {
                        sc->max_queues = atoi(config);
                } else if (!strcmp("qsz", xopts)) {
                        sc->max_qentries = atoi(config);
                } else if (!strcmp("ioslots", xopts)) {
                        sc->ioslots = atoi(config);
                } else if (!strcmp("sectsz", xopts)) {
                        sectsz = atoi(config);
                } else if (!strcmp("ser", xopts)) {
                        /*
                         * This field indicates the Product Serial Number in
                         * 7-bit ASCII; unused bytes should be space
                         * characters.  Ref: NVMe v1.3c.
                         */
                        cpywithpad((char *)sc->ctrldata.sn,
                                   sizeof(sc->ctrldata.sn), config, ' ');
                } else if (!strcmp("ram", xopts)) {
                        /* config points at the MiB count after "ram=" */
                        uint64_t sz = strtoull(config, NULL, 10);

                        sc->nvstore.type = NVME_STOR_RAM;
                        sc->nvstore.size = sz * 1024 * 1024;
                        sc->nvstore.ctx = calloc(1, sc->nvstore.size);
                        sc->nvstore.sectsz = 4096;
                        sc->nvstore.sectsz_bits = 12;
                        if (sc->nvstore.ctx == NULL) {
                                perror("Unable to allocate RAM");
                                free(uopt);
                                return (-1);
                        }
                } else if (optidx == 0) {
                        snprintf(bident, sizeof(bident), "%d:%d",
                                 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
                        sc->nvstore.ctx = blockif_open(xopts, bident);
                        if (sc->nvstore.ctx == NULL) {
                                perror("Could not open backing file");
                                free(uopt);
                                return (-1);
                        }
                        sc->nvstore.type = NVME_STOR_BLOCKIF;
                        sc->nvstore.size = blockif_size(sc->nvstore.ctx);
                } else {
                        fprintf(stderr, "Invalid option %s\n", xopts);
                        free(uopt);
                        return (-1);
                }

                optidx++;
        }
        free(uopt);

        if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
                fprintf(stderr, "backing store not specified\n");
                return (-1);
        }
        if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
                sc->nvstore.sectsz = sectsz;
        else if (sc->nvstore.type != NVME_STOR_RAM)
                sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
        for (sc->nvstore.sectsz_bits = 9;
             (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
             sc->nvstore.sectsz_bits++);

        if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
                sc->max_queues = NVME_QUEUES;

        if (sc->max_qentries <= 0) {
                fprintf(stderr, "Invalid qsz option\n");
                return (-1);
        }
        if (sc->ioslots <= 0) {
                fprintf(stderr, "Invalid ioslots option\n");
                return (-1);
        }

        return (0);
}
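
/*
 * Example option strings accepted above (paths and values are
 * illustrative):
 *
 *      -s 3,nvme,/dev/zvol/tank/nvmedisk,sectsz=4096,ser=NVME0001
 *      -s 3,nvme,ram=1024,maxq=8,qsz=512,ioslots=16
 *
 * A plain path is only accepted as the first token after "nvme" and is
 * opened via blockif; "ram=<MiB>" may appear anywhere and allocates a
 * memory-backed store with a fixed 4096-byte sector size.
 */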

static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
        struct pci_nvme_softc *sc;
        uint32_t pci_membar_sz;
        int     error;

        error = 0;

        sc = calloc(1, sizeof(struct pci_nvme_softc));
        if (sc == NULL)
                return (-1);
        pi->pi_arg = sc;
        sc->nsc_pi = pi;

        error = pci_nvme_parse_opts(sc, opts);
        if (error < 0)
                goto done;
        else
                error = 0;

        sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
        for (int i = 0; i < sc->ioslots; i++) {
                if (i < (sc->ioslots-1))
                        sc->ioreqs[i].next = &sc->ioreqs[i+1];
                pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
                pthread_cond_init(&sc->ioreqs[i].cv, NULL);
        }
        sc->ioreqs_free = sc->ioreqs;
        sc->intr_coales_aggr_thresh = 1;

        pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
        pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
        pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
        pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
        pci_set_cfgdata8(pi, PCIR_PROGIF,
                         PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

        /*
         * Allocate size of NVMe registers + doorbell space for all queues.
         *
         * The specification requires a minimum memory I/O window size of 16K.
         * The Windows driver will refuse to start a device with a smaller
         * window.
         */
        pci_membar_sz = sizeof(struct nvme_registers) +
            2 * sizeof(uint32_t) * (sc->max_queues + 1);
        pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

        DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));

        error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
        if (error) {
                WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
                goto done;
        }

        error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
        if (error) {
                WPRINTF(("%s pci add msixcap failed\r\n", __func__));
                goto done;
        }

        pthread_mutex_init(&sc->mtx, NULL);
        sem_init(&sc->iosemlock, 0, sc->ioslots);

        pci_nvme_reset(sc);
        pci_nvme_init_ctrldata(sc);
        pci_nvme_init_nsdata(sc);
        pci_nvme_init_logpages(sc);

        pci_lintr_request(pi);

done:
        return (error);
}
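
/*
 * BAR 0 sizing, worked through: with maxq at its default and maximum of
 * NVME_QUEUES (16), the raw size is sizeof(struct nvme_registers) plus
 * 2 doorbells * 4 bytes * 17 queue pairs (admin + 16 I/O), well under
 * 16 KiB, so the NVME_MMIO_SPACE_MIN floor is what actually sizes the
 * BAR in practice.
 */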

struct pci_devemu pci_de_nvme = {
        .pe_emu =       "nvme",
        .pe_init =      pci_nvme_init,
        .pe_barwrite =  pci_nvme_write,
        .pe_barread =   pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);
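
/*
 * PCI_EMUL_SET() adds pci_de_nvme to the linker set that bhyve's PCI
 * emulation core scans at startup, matching the "nvme" token of a
 * "-s <slot>,nvme,..." argument to this device model.
 */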