/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *
 */
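
/*
 * Example (illustrative only; the device path and tuning values below
 * are hypothetical):
 *
 *  -s 4,nvme,/path/to/image,maxq=8,qsz=1024,ioslots=16,sectsz=512,ser=NVME0001
 *
 * A RAM-backed namespace can be requested instead with e.g. ram=1024.
 */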

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(params) if (nvme_debug) printf params
#define WPRINTF(params) printf params

/* defaults; can be overridden */
#define NVME_MSIX_BAR           4

#define NVME_IOSLOTS            8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN     (1 << 14)

#define NVME_QUEUES             16
#define NVME_MAX_QENTRIES       2048

#define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS      512

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)         ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)         ((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
        ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)

#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
        NVME_CR_CAP_LOW = 0x00,
        NVME_CR_CAP_HI  = 0x04,
        NVME_CR_VS      = 0x08,
        NVME_CR_INTMS   = 0x0c,
        NVME_CR_INTMC   = 0x10,
        NVME_CR_CC      = 0x14,
        NVME_CR_CSTS    = 0x1c,
        NVME_CR_NSSR    = 0x20,
        NVME_CR_AQA     = 0x24,
        NVME_CR_ASQ_LOW = 0x28,
        NVME_CR_ASQ_HI  = 0x2c,
        NVME_CR_ACQ_LOW = 0x30,
        NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
        NVME_CMD_CDW11_PC  = 0x0001,
        NVME_CMD_CDW11_IEN = 0x0002,
        NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define NVME_CQ_INTEN   0x01
#define NVME_CQ_INTCOAL 0x02

struct nvme_completion_queue {
        struct nvme_completion *qbase;
        uint32_t        size;
        uint16_t        tail; /* nvme progress */
        uint16_t        head; /* guest progress */
        uint16_t        intr_vec;
        uint32_t        intr_en;
        pthread_mutex_t mtx;
};

struct nvme_submission_queue {
        struct nvme_command *qbase;
        uint32_t        size;
        uint16_t        head; /* nvme progress */
        uint16_t        tail; /* guest progress */
        uint16_t        cqid; /* completion queue id */
        int             busy; /* queue is being processed */
        int             qpriority;
};

enum nvme_storage_type {
        NVME_STOR_BLOCKIF = 0,
        NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
        enum nvme_storage_type type;
        void            *ctx;
        uint64_t        size;
        uint32_t        sectsz;
        uint32_t        sectsz_bits;
};

struct pci_nvme_ioreq {
        struct pci_nvme_softc *sc;
        struct pci_nvme_ioreq *next;
        struct nvme_submission_queue *nvme_sq;
        uint16_t        sqid;

        /* command information */
        uint16_t        opc;
        uint16_t        cid;
        uint32_t        nsid;

        uint64_t        prev_gpaddr;
        size_t          prev_size;

        /*
         * lock if all iovs consumed (big IO);
         * complete transaction before continuing
         */
        pthread_mutex_t mtx;
        pthread_cond_t  cv;

        struct blockif_req io_req;

        /* pad to fit up to 512 page descriptors from guest IO request */
        struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

struct pci_nvme_softc {
        struct pci_devinst *nsc_pi;

        pthread_mutex_t mtx;

        struct nvme_registers regs;

        struct nvme_namespace_data  nsdata;
        struct nvme_controller_data ctrldata;
        struct nvme_error_information_entry err_log;
        struct nvme_health_information_page health_log;
        struct nvme_firmware_page fw_log;

        struct pci_nvme_blockstore nvstore;

        uint16_t        max_qentries;   /* max entries per queue */
        uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
        uint32_t        num_cqueues;
        uint32_t        num_squeues;

        struct pci_nvme_ioreq *ioreqs;
        struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
        uint32_t        pending_ios;
        uint32_t        ioslots;
        sem_t           iosemlock;

        /*
         * Memory mapped Submission and Completion queues
         * Each array includes both Admin and IO queues
         */
        struct nvme_completion_queue *compl_queues;
        struct nvme_submission_queue *submit_queues;

        /* controller features */
        uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
        uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
        uint32_t        async_ev_config;         /* 0x0B: async event config */
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
        ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
        ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
        ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
        ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
        ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
        ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
         (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
         (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
        ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
         (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
         (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
        ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
        ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
         (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
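
/*
 * Copy src into the fixed-size dst buffer, padding any remainder with
 * 'pad'.  NVMe identify strings (model number, firmware revision, etc.)
 * are space-padded ASCII fields without a NUL terminator, so callers
 * below pass ' ' as the pad byte.
 */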
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
        size_t len;

        len = strnlen(src, dst_size);
        memset(dst, pad, dst_size);
        memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

        *status &= ~NVME_STATUS_MASK;
        *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
                (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

        pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

        if (prev)
                *status &= ~NVME_STATUS_P;
        else
                *status |= NVME_STATUS_P;
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
        struct nvme_controller_data *cd = &sc->ctrldata;

        cd->vid = 0xFB5D;
        cd->ssvid = 0x0000;

        cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
        cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

        /* Num of submission commands that we can handle at a time (2^rab) */
        cd->rab   = 4;

        /* FreeBSD OUI */
        cd->ieee[0] = 0x58;
        cd->ieee[1] = 0x9c;
        cd->ieee[2] = 0xfc;

        cd->mic = 0;

        cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */

        cd->ver = 0x00010300;

        cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
        cd->acl = 2;
        cd->aerl = 4;

        cd->lpa = 0;    /* TODO: support some simple things like SMART */
        cd->elpe = 0;   /* max error log page entries */
        cd->npss = 1;   /* number of power states supported */

        /* Warning Composite Temperature Threshold */
        cd->wctemp = 0x0157;

        cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
            (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
        cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
            (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
        cd->nn = 1;     /* number of namespaces */

        cd->fna = 0x03;

        cd->power_state[0].mp = 10;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
{
        struct nvme_namespace_data *nd;

        nd = &sc->nsdata;

        nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
        nd->ncap = nd->nsze;
        nd->nuse = nd->nsze;

        /* Get LBA and backstore information from backing store */
        nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
        /* LBA data-sz = 2^lbads */
        nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;

        nd->flbas = 0;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

        memset(&sc->err_log, 0, sizeof(sc->err_log));
        memset(&sc->health_log, 0, sizeof(sc->health_log));
        memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
        DPRINTF(("%s\r\n", __func__));

        sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
            (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
            (60 << NVME_CAP_LO_REG_TO_SHIFT);

        sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

        sc->regs.vs = 0x00010300;       /* NVMe v1.3 */

        sc->regs.cc = 0;
        sc->regs.csts = 0;

        sc->num_cqueues = sc->num_squeues = sc->max_queues;
        if (sc->submit_queues != NULL) {
                for (int i = 0; i < sc->num_squeues + 1; i++) {
                        /*
                         * The Admin Submission Queue is at index 0.
                         * It must not be changed at reset otherwise the
                         * emulation will be out of sync with the guest.
                         */
                        if (i != 0) {
                                sc->submit_queues[i].qbase = NULL;
                                sc->submit_queues[i].size = 0;
                                sc->submit_queues[i].cqid = 0;
                        }
                        sc->submit_queues[i].tail = 0;
                        sc->submit_queues[i].head = 0;
                        sc->submit_queues[i].busy = 0;
                }
        } else
                sc->submit_queues = calloc(sc->num_squeues + 1,
                                        sizeof(struct nvme_submission_queue));

        if (sc->compl_queues != NULL) {
                for (int i = 0; i < sc->num_cqueues + 1; i++) {
                        /* See Admin Submission Queue note above */
                        if (i != 0) {
                                sc->compl_queues[i].qbase = NULL;
                                sc->compl_queues[i].size = 0;
                        }

                        sc->compl_queues[i].tail = 0;
                        sc->compl_queues[i].head = 0;
                }
        } else {
                sc->compl_queues = calloc(sc->num_cqueues + 1,
                                        sizeof(struct nvme_completion_queue));

                for (int i = 0; i < sc->num_cqueues + 1; i++)
                        pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
        }
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
        pthread_mutex_lock(&sc->mtx);
        pci_nvme_reset_locked(sc);
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
        uint16_t acqs, asqs;

        DPRINTF(("%s\r\n", __func__));

        asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
        sc->submit_queues[0].size = asqs;
        sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
                    sizeof(struct nvme_command) * asqs);

        DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.asq, sc->submit_queues[0].qbase));

        acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
            NVME_AQA_REG_ACQS_MASK) + 1;
        sc->compl_queues[0].size = acqs;
        sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
                 sizeof(struct nvme_completion) * acqs);
        DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}

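/*
 * Copy 'len' bytes from 'src' into guest memory described by a PRP
 * (Physical Region Page) pair: prp1 may begin mid-page and is filled to
 * the end of that page, with any remainder going to prp2.  Transfers
 * larger than two pages (8KiB here) are rejected, which suffices for
 * the log page and identify payloads this is used for.
 */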
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
        size_t len)
{
        uint8_t *dst;
        size_t bytes;

        if (len > (8 * 1024)) {
                return (-1);
        }

        /* Copy from the start of prp1 to the end of the physical page */
        bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
        bytes = MIN(bytes, len);

        dst = vm_map_gpa(ctx, prp1, bytes);
        if (dst == NULL) {
                return (-1);
        }

        memcpy(dst, src, bytes);

        src += bytes;

        len -= bytes;
        if (len == 0) {
                return (0);
        }

        len = MIN(len, PAGE_SIZE);

        dst = vm_map_gpa(ctx, prp2, len);
        if (dst == NULL) {
                return (-1);
        }

        memcpy(dst, src, len);

        return (0);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_squeues) {
                WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
                        __func__, qid, sc->num_squeues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->submit_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_submission_queue *nsq;

                if ((qid == 0) || (qid > sc->num_squeues)) {
                        WPRINTF(("%s queue index %u > num_squeues %u\r\n",
                                __func__, qid, sc->num_squeues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                nsq = &sc->submit_queues[qid];
                nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                              sizeof(struct nvme_command) * (size_t)nsq->size);
                nsq->cqid = (command->cdw11 >> 16) & 0xffff;
                nsq->qpriority = (command->cdw11 >> 1) & 0x03;

                DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
                        qid, nsq->size, nsq->qbase, nsq->cqid));

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

                DPRINTF(("%s completed creating IOSQ qid %u\r\n",
                         __func__, qid));
        } else {
                /*
                 * Guest sent a non-contiguous submission queue request.
                 * This setting is unsupported by this emulation.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o submission queue\r\n", __func__));

                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        }
        return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_cqueues) {
                WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
                        __func__, qid, sc->num_cqueues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->compl_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_completion_queue *ncq;

                if ((qid == 0) || (qid > sc->num_cqueues)) {
                        WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
                                __func__, qid, sc->num_cqueues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                ncq = &sc->compl_queues[qid];
                ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
                ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
                ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
                             command->prp1,
                             sizeof(struct nvme_completion) * (size_t)ncq->size);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        } else {
                /*
                 * Non-contiguous completion queues are unsupported.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o completion queue\r\n",
                         __func__));

                /* 0x12 = Invalid Use of Controller Memory Buffer */
                pci_nvme_status_genc(&compl->status, 0x12);
        }

        return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
        uint8_t logpage = command->cdw10 & 0xFF;

        DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        switch (logpage) {
        case NVME_LOG_ERROR:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->err_log, logsize);
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                /* TODO: present some smart info */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->health_log, logsize);
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->fw_log, logsize);
                break;
        default:
                WPRINTF(("%s get log page %x command not supported\r\n",
                        __func__, logpage));

                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_LOG_PAGE);
        }

        return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        void *dest;

        DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
                command->cdw10 & 0xFF, command->nsid));

        switch (command->cdw10 & 0xFF) {
        case 0x00: /* return Identify Namespace data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
                break;
        case 0x01: /* return Identify Controller data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->ctrldata,
                    sizeof(sc->ctrldata));
                break;
        case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                ((uint32_t *)dest)[0] = 1;
                ((uint32_t *)dest)[1] = 0;
                break;
        case 0x11:
                pci_nvme_status_genc(&compl->status,
                    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
                return (1);
        case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        case 0x10:
        case 0x12:
        case 0x13:
        case 0x14:
        case 0x15:
        default:
                DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
                         __func__, command->cdw10 & 0xFF));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t nqr;   /* Number of Queues Requested */

        nqr = command->cdw11 & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_squeues = ONE_BASED(nqr);
        if (sc->num_squeues > sc->max_queues) {
                DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
                                        sc->max_queues));
                sc->num_squeues = sc->max_queues;
        }

        nqr = (command->cdw11 >> 16) & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_cqueues = ONE_BASED(nqr);
        if (sc->num_cqueues > sc->max_queues) {
                DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
                                        sc->max_queues));
                sc->num_cqueues = sc->max_queues;
        }

        compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

        return (0);
}

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;
        uint32_t iv;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                nvme_set_feature_queues(sc, command, compl);
                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));

                /* in uS */
                sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

                sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                iv = command->cdw11 & 0xFFFF;

                DPRINTF(("  interrupt vector configuration 0x%x\r\n",
                        command->cdw11));

                for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
                        if (sc->compl_queues[i].intr_vec == iv) {
                                if (command->cdw11 & (1 << 16))
                                        sc->compl_queues[i].intr_en |=
                                                              NVME_CQ_INTCOAL;
                                else
                                        sc->compl_queues[i].intr_en &=
                                                             ~NVME_CQ_INTCOAL;
                        }
                }
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration 0x%x\r\n",
                        command->cdw11));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker 0x%x\r\n",
                        command->cdw11));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition 0x%x\r\n",
                        command->cdw11));
                break;
        default:
                WPRINTF(("%s invalid feature\r\n", __func__));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));

        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration\r\n"));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management\r\n"));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range\r\n"));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold\r\n"));
                switch ((command->cdw11 >> 20) & 0x3) {
                case 0:
                        /* Over temp threshold */
                        compl->cdw0 = 0xFFFF;
                        break;
                case 1:
                        /* Under temp threshold */
                        compl->cdw0 = 0;
                        break;
                default:
                        WPRINTF(("  invalid threshold type select\r\n"));
                        pci_nvme_status_genc(&compl->status,
                            NVME_SC_INVALID_FIELD);
                        return (1);
                }
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery\r\n"));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache\r\n"));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

                DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
                        compl->cdw0 & 0xFFFF,
                        (compl->cdw0 >> 16) & 0xFFFF));

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing\r\n"));
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                DPRINTF(("  interrupt vector configuration\r\n"));
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity\r\n"));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration\r\n"));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker\r\n"));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition\r\n"));
                break;
        default:
                WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
                command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

        /* TODO: search for the command ID and abort it */

        compl->cdw0 = 1;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
        struct nvme_command* command, struct nvme_completion* compl)
{
        DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));

        /*
         * TODO: raise events when they happen based on the Set Features cmd.
         * These events happen async, so only set completion successful if
         * there is an event reflective of the request to get event.
         */
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return (0);
}

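/*
 * Process new entries on the Admin Submission Queue in response to a
 * doorbell write: consume commands from head to tail, dispatch on the
 * opcode, and post each completion to the Admin CQ with the phase bit
 * toggled before raising MSI-X vector 0.
 */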
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
        struct nvme_completion compl;
        struct nvme_command *cmd;
        struct nvme_submission_queue *sq;
        struct nvme_completion_queue *cq;
        int do_intr = 0;
        uint16_t sqhead;

        DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));

        sq = &sc->submit_queues[0];

        sqhead = atomic_load_acq_short(&sq->head);

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
                        __func__, sqhead, sq->tail));
                return;
        }

        DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                cmd = &(sq->qbase)[sqhead];
                compl.cdw0 = 0;
                compl.status = 0;

                switch (cmd->opc) {
                case NVME_OPC_DELETE_IO_SQ:
                        DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_SQ:
                        DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_DELETE_IO_CQ:
                        DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_CQ:
                        DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_LOG_PAGE:
                        DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
                        do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
                        break;
                case NVME_OPC_IDENTIFY:
                        DPRINTF(("%s command IDENTIFY\r\n", __func__));
                        do_intr |= nvme_opc_identify(sc, cmd, &compl);
                        break;
                case NVME_OPC_ABORT:
                        DPRINTF(("%s command ABORT\r\n", __func__));
                        do_intr |= nvme_opc_abort(sc, cmd, &compl);
                        break;
                case NVME_OPC_SET_FEATURES:
                        DPRINTF(("%s command SET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_set_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_FEATURES:
                        DPRINTF(("%s command GET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_get_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_ASYNC_EVENT_REQUEST:
                        DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
                        /* XXX don't care, unhandled for now
                        do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
                        */
                        break;
                default:
                        WPRINTF(("0x%x command is not implemented\r\n",
                            cmd->opc));
                }

                /* for now skip async event generation */
                if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
                        struct nvme_completion *cp;
                        int phase;

                        cq = &sc->compl_queues[0];

                        cp = &(cq->qbase)[cq->tail];
                        cp->cdw0 = compl.cdw0;
                        cp->sqid = 0;
                        cp->sqhd = sqhead;
                        cp->cid = cmd->cid;

                        phase = NVME_STATUS_GET_P(cp->status);
                        cp->status = compl.status;
                        pci_nvme_toggle_phase(&cp->status, phase);

                        cq->tail = (cq->tail + 1) % cq->size;
                }
                sqhead = (sqhead + 1) % sq->size;
        }

        DPRINTF(("setting sqhead %u\r\n", sqhead));
        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);

        if (do_intr)
                pci_generate_msix(sc->nsc_pi, 0);
}

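/*
 * Append a guest physical address range to the blockif request backing
 * 'req', merging it with the previous iov when the ranges are physically
 * contiguous.  When the iov list is full (NVME_MAX_BLOCKIOVS), the
 * accumulated partial request is submitted and this thread waits for it
 * to complete before continuing.  A NULL 'req' means RAM-backed storage,
 * in which case the data is copied directly.
 */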
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
        uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
        int iovidx;

        if (req != NULL) {
                /* concatenate contig block-iovs to minimize number of iovs */
                if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
                        iovidx = req->io_req.br_iovcnt - 1;

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             req->prev_gpaddr, size);

                        req->prev_size += size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
                } else {
                        pthread_mutex_lock(&req->mtx);

                        iovidx = req->io_req.br_iovcnt;
                        if (iovidx == NVME_MAX_BLOCKIOVS) {
                                int err = 0;

                                DPRINTF(("large I/O, doing partial req\r\n"));

                                iovidx = 0;
                                req->io_req.br_iovcnt = 0;

                                req->io_req.br_callback = pci_nvme_io_partial;

                                if (!do_write)
                                        err = blockif_read(sc->nvstore.ctx,
                                                           &req->io_req);
                                else
                                        err = blockif_write(sc->nvstore.ctx,
                                                            &req->io_req);

                                /* wait until req completes before cont */
                                if (err == 0)
                                        pthread_cond_wait(&req->cv, &req->mtx);
                        }
                        if (iovidx == 0) {
                                req->io_req.br_offset = lba;
                                req->io_req.br_resid = 0;
                                req->io_req.br_param = req;
                        }

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             gpaddr, size);

                        req->io_req.br_iov[iovidx].iov_len = size;

                        req->prev_gpaddr = gpaddr;
                        req->prev_size = size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iovcnt++;

                        pthread_mutex_unlock(&req->mtx);
                }
        } else {
                /* RAM buffer: read/write directly */
                void *p = sc->nvstore.ctx;
                void *gptr;

                if ((lba + size) > sc->nvstore.size) {
                        WPRINTF(("%s I/O would overflow RAM backing\r\n",
                                __func__));
                        return (-1);
                }

                p = (void *)((uintptr_t)p + (uintptr_t)lba);
                gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
                if (do_write)
                        memcpy(p, gptr, size);
                else
                        memcpy(gptr, p, size);
        }
        return (0);
}

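/*
 * Post a completion to the CQ associated with 'sq': write the entry at
 * the CQ tail, toggle the phase bit so the guest sees it as new, and
 * generate an MSI-X interrupt if the queue has interrupts enabled.
 */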
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
        struct nvme_submission_queue *sq, int sqid, uint16_t cid,
        uint32_t cdw0, uint16_t status, int ignore_busy)
{
        struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
        struct nvme_completion *compl;
        int do_intr = 0;
        int phase;

        DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
                 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
                 NVME_STATUS_GET_SC(status)));

        pthread_mutex_lock(&cq->mtx);

        assert(cq->qbase != NULL);

        compl = &cq->qbase[cq->tail];

        compl->sqhd = atomic_load_acq_short(&sq->head);
        compl->sqid = sqid;
        compl->cid = cid;

        /* toggle phase */
        phase = NVME_STATUS_GET_P(compl->status);
        compl->status = status;
        pci_nvme_toggle_phase(&compl->status, phase);

        cq->tail = (cq->tail + 1) % cq->size;

        if (cq->intr_en & NVME_CQ_INTEN)
                do_intr = 1;

        pthread_mutex_unlock(&cq->mtx);

        if (ignore_busy || !atomic_load_acq_int(&sq->busy))
                if (do_intr)
                        pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
        req->sc = NULL;
        req->nvme_sq = NULL;
        req->sqid = 0;

        pthread_mutex_lock(&sc->mtx);

        req->next = sc->ioreqs_free;
        sc->ioreqs_free = req;
        sc->pending_ios--;

        /* when no more IO pending, can set to ready if device reset/enabled */
        if (sc->pending_ios == 0 &&
            NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
                sc->regs.csts |= NVME_CSTS_RDY;

        pthread_mutex_unlock(&sc->mtx);

        sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
        struct pci_nvme_ioreq *req = NULL;

        sem_wait(&sc->iosemlock);
        pthread_mutex_lock(&sc->mtx);

        req = sc->ioreqs_free;
        assert(req != NULL);

        sc->ioreqs_free = req->next;

        req->next = NULL;
        req->sc = sc;

        sc->pending_ios++;

        pthread_mutex_unlock(&sc->mtx);

        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = 0;
        req->io_req.br_resid = 0;
        req->io_req.br_param = req;
        req->prev_gpaddr = 0;
        req->prev_size = 0;

        return (req);
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
        struct pci_nvme_ioreq *req = br->br_param;
        struct nvme_submission_queue *sq = req->nvme_sq;
        uint16_t code, status;

        DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

        /* TODO return correct error */
        code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
        pci_nvme_status_genc(&status, code);

        pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
        pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
        struct pci_nvme_ioreq *req = br->br_param;

        DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

        pthread_cond_signal(&req->cv);
}

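/*
 * Process new I/O Submission Queue entries: walk the queue from head to
 * tail, translate each command's PRP1/PRP2 (or PRP list) into an iovec
 * chain via pci_nvme_append_iov_req(), and issue the transfer through
 * blockif (or copy directly for RAM-backed storage).
 */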
1269 static void
1270 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1271 {
1272         struct nvme_submission_queue *sq;
1273         uint16_t status;
1274         uint16_t sqhead;
1275         int err;
1276
1277         /* handle all submissions up to sq->tail index */
1278         sq = &sc->submit_queues[idx];
1279
1280         if (atomic_testandset_int(&sq->busy, 1)) {
1281                 DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1282                 return;
1283         }
1284
1285         sqhead = atomic_load_acq_short(&sq->head);
1286
1287         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1288                  idx, sqhead, sq->tail, sq->qbase));
1289
1290         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1291                 struct nvme_command *cmd;
1292                 struct pci_nvme_ioreq *req = NULL;
1293                 uint64_t lba;
1294                 uint64_t nblocks, bytes, size, cpsz;
1295
1296                 /* TODO: support scatter gather list handling */
1297
1298                 cmd = &sq->qbase[sqhead];
1299                 sqhead = (sqhead + 1) % sq->size;
1300
1301                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1302
1303                 if (cmd->opc == NVME_OPC_FLUSH) {
1304                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1305                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1306                                                 status, 1);
1307
1308                         continue;
1309                 } else if (cmd->opc == 0x08) {
1310                         /* TODO: write zeroes */
1311                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1312                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1313                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1314                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1315                                                 status, 1);
1316
1317                         continue;
1318                 }
1319
1320                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1321
1322                 bytes = nblocks * sc->nvstore.sectsz;
1323
1324                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1325                         req = pci_nvme_get_ioreq(sc);
1326                         req->nvme_sq = sq;
1327                         req->sqid = idx;
1328                 }
1329
1330                 /*
1331                  * If data starts mid-page and flows into the next page, then
1332                  * increase page count
1333                  */
1334
1335                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1336                          "(%lu-bytes)\r\n",
1337                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1338                          cmd->opc == NVME_OPC_WRITE ?
1339                              "WRITE" : "READ",
1340                          lba, nblocks, bytes));
1341
1342                 cmd->prp1 &= ~(0x03UL);
1343                 cmd->prp2 &= ~(0x03UL);
1344
1345                 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1346
1347                 size = bytes;
1348                 lba *= sc->nvstore.sectsz;
1349
1350                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1351
1352                 if (cpsz > bytes)
1353                         cpsz = bytes;
1354
1355                 if (req != NULL) {
1356                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1357                                                 cmd->cdw10;
1358                         req->opc = cmd->opc;
1359                         req->cid = cmd->cid;
1360                         req->nsid = cmd->nsid;
1361                 }
1362
1363                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1364                     cmd->opc == NVME_OPC_WRITE, lba);
1365                 lba += cpsz;
1366                 size -= cpsz;
1367
1368                 if (size == 0)
1369                         goto iodone;
1370
1371                 if (size <= PAGE_SIZE) {
1372                         /* prp2 is second (and final) page in transfer */
1373
1374                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1375                             size,
1376                             cmd->opc == NVME_OPC_WRITE,
1377                             lba);
1378                 } else {
1379                         uint64_t *prp_list;
1380                         int i;
1381
1382                         /* prp2 is pointer to a physical region page list */
1383                         prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1384                                                     cmd->prp2, PAGE_SIZE);
1385
1386                         i = 0;
1387                         while (size != 0) {
1388                                 cpsz = MIN(size, PAGE_SIZE);
1389
1390                                 /*
1391                                  * Move to linked physical region page list
1392                                  * in last item.
1393                                  */ 
1394                                 if (i == (NVME_PRP2_ITEMS-1) &&
1395                                     size > PAGE_SIZE) {
1396                                         assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1397                                         prp_list = paddr_guest2host(
1398                                                       sc->nsc_pi->pi_vmctx,
1399                                                       prp_list[i], PAGE_SIZE);
1400                                         i = 0;
1401                                 }
1402                                 if (prp_list[i] == 0) {
1403                                         WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1404                                         err = 1;
1405                                         break;
1406                                 }
1407
1408                                 err = pci_nvme_append_iov_req(sc, req,
1409                                     prp_list[i], cpsz,
1410                                     cmd->opc == NVME_OPC_WRITE, lba);
1411                                 if (err)
1412                                         break;
1413
1414                                 lba += cpsz;
1415                                 size -= cpsz;
1416                                 i++;
1417                         }
1418                 }
1419
1420 iodone:
1421                 if (sc->nvstore.type == NVME_STOR_RAM) {
1422                         uint16_t code, status;
1423
1424                         code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1425                             NVME_SC_SUCCESS;
1426                         pci_nvme_status_genc(&status, code);
1427
1428                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1429                                                 status, 1);
1430
1431                         continue;
1432                 }
1433
1434
1435                 if (err)
1436                         goto do_error;
1437
1438                 req->io_req.br_callback = pci_nvme_io_done;
1439
1440                 err = 0;
1441                 switch (cmd->opc) {
1442                 case NVME_OPC_READ:
1443                         err = blockif_read(sc->nvstore.ctx, &req->io_req);
1444                         break;
1445                 case NVME_OPC_WRITE:
1446                         err = blockif_write(sc->nvstore.ctx, &req->io_req);
1447                         break;
1448                 default:
1449                         WPRINTF(("%s unhandled io command 0x%x\r\n",
1450                                  __func__, cmd->opc));
1451                         err = 1;
1452                 }
1453
1454 do_error:
1455                 if (err) {
1456                         uint16_t status;
1457
1458                         pci_nvme_status_genc(&status,
1459                             NVME_SC_DATA_TRANSFER_ERROR);
1460
1461                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1462                                                 status, 1);
1463                         pci_nvme_release_ioreq(sc, req);
1464                 }
1465         }
1466
1467         atomic_store_short(&sq->head, sqhead);
1468         atomic_store_int(&sq->busy, 0);
1469 }
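
/*
 * Illustrative sketch (a hypothetical helper, not part of the
 * emulation): the PRP2 walk above consumes NVME_PRP2_ITEMS - 1 data
 * entries per list page before following the final entry to the next
 * chained list, so for a page-aligned transfer of `size` bytes the
 * guest must supply roughly this many PRP list pages:
 */
static uint64_t
prp_list_pages(uint64_t size)
{
        uint64_t data_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
        uint64_t lists = 1;

        /* A full list holds NVME_PRP2_ITEMS - 1 data pointers; its
         * last slot links to the next list page. */
        while (data_pages > NVME_PRP2_ITEMS) {
                data_pages -= NVME_PRP2_ITEMS - 1;
                lists++;
        }
        return (lists);
}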
1470
1471 static void
1472 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1473         uint64_t idx, int is_sq, uint64_t value)
1474 {
1475         DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1476                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1477
1478         if (is_sq) {
1479                 atomic_store_short(&sc->submit_queues[idx].tail,
1480                                    (uint16_t)value);
1481
1482                 if (idx == 0) {
1483                         pci_nvme_handle_admin_cmd(sc, value);
1484                 } else {
1485                         /* I/O submission queue; handle new entries */
1486                         if (idx > sc->num_squeues) {
1487                                 WPRINTF(("%s SQ index %lu overflow from "
1488                                          "guest (max %u)\r\n",
1489                                          __func__, idx, sc->num_squeues));
1490                                 return;
1491                         }
1492                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1493                 }
1494         } else {
1495                 if (idx > sc->num_cqueues) {
1496                         WPRINTF(("%s queue index %lu overflow from "
1497                                  "guest (max %u)\r\n",
1498                                  __func__, idx, sc->num_cqueues));
1499                         return;
1500                 }
1501
1502                 sc->compl_queues[idx].head = (uint16_t)value;
1503         }
1504 }
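
/*
 * For orientation, a hedged guest-side sketch (hypothetical driver
 * code, not bhyve's) of what lands in this handler: with the default
 * doorbell stride (CAP.DSTRD == 0), queue pair N's SQ tail doorbell
 * sits at BAR0 offset 0x1000 + 8*N and its CQ head doorbell 4 bytes
 * later, so publishing a new SQ tail is a single 32-bit write.
 */
static void
guest_ring_sq_doorbell(volatile uint32_t *bar0, uint16_t qid,
    uint16_t new_tail)
{
        /* Doorbells start at byte 0x1000 of BAR0; each queue pair
         * owns one SQ tail dword followed by one CQ head dword. */
        bar0[(0x1000 / 4) + qid * 2] = new_tail;
}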
1505
1506 static void
1507 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1508 {
1509         const char *s = iswrite ? "WRITE" : "READ";
1510
1511         switch (offset) {
1512         case NVME_CR_CAP_LOW:
1513                 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1514                 break;
1515         case NVME_CR_CAP_HI:
1516                 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1517                 break;
1518         case NVME_CR_VS:
1519                 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1520                 break;
1521         case NVME_CR_INTMS:
1522                 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1523                 break;
1524         case NVME_CR_INTMC:
1525                 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1526                 break;
1527         case NVME_CR_CC:
1528                 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1529                 break;
1530         case NVME_CR_CSTS:
1531                 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1532                 break;
1533         case NVME_CR_NSSR:
1534                 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1535                 break;
1536         case NVME_CR_AQA:
1537                 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1538                 break;
1539         case NVME_CR_ASQ_LOW:
1540                 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1541                 break;
1542         case NVME_CR_ASQ_HI:
1543                 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1544                 break;
1545         case NVME_CR_ACQ_LOW:
1546                 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1547                 break;
1548         case NVME_CR_ACQ_HI:
1549                 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1550                 break;
1551         default:
1552                 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1553         }
1554
1555 }
1556
1557 static void
1558 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1559         uint64_t offset, int size, uint64_t value)
1560 {
1561         uint32_t ccreg;
1562
1563         if (offset >= NVME_DOORBELL_OFFSET) {
1564                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1565                 uint64_t idx = belloffset / 8; /* 8 bytes per SQ/CQ doorbell pair */
1566                 int is_sq = (belloffset % 8) < 4;
1567
1568                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1569                         WPRINTF(("guest attempted an overflow write offset "
1570                                  "0x%lx, val 0x%lx in %s\r\n",
1571                                  offset, value, __func__));
1572                         return;
1573                 }
1574
1575                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1576                 return;
1577         }
1578
1579         DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1580                 offset, size, value));
1581
1582         if (size != 4) {
1583                 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1584                          "val 0x%lx) to bar0 in %s\r\n",
1585                          size, offset, value, __func__));
1586                 /* TODO: shutdown device */
1587                 return;
1588         }
1589
1590         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1591
1592         pthread_mutex_lock(&sc->mtx);
1593
1594         switch (offset) {
1595         case NVME_CR_CAP_LOW:
1596         case NVME_CR_CAP_HI:
1597                 /* readonly */
1598                 break;
1599         case NVME_CR_VS:
1600                 /* readonly */
1601                 break;
1602         case NVME_CR_INTMS:
1603                 /* MSI-X, so ignore */
1604                 break;
1605         case NVME_CR_INTMC:
1606                 /* MSI-X, so ignore */
1607                 break;
1608         case NVME_CR_CC:
1609                 ccreg = (uint32_t)value;
1610
1611                 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1612                          "iocqes %u\r\n",
1613                         __func__,
1614                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1615                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1616                          NVME_CC_GET_IOCQES(ccreg)));
1617
1618                 if (NVME_CC_GET_SHN(ccreg)) {
1619                         /* perform shutdown - flush out data to backend */
1620                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1621                             NVME_CSTS_REG_SHST_SHIFT);
1622                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1623                             NVME_CSTS_REG_SHST_SHIFT;
1624                 }
1625                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1626                         if (NVME_CC_GET_EN(ccreg) == 0)
1627                                 /* transition 1->0 causes controller reset */
1628                                 pci_nvme_reset_locked(sc);
1629                         else
1630                                 pci_nvme_init_controller(ctx, sc);
1631                 }
1632
1633                 /* Insert the iocqes, iosqes and en bits from the write */
1634                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1635                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1636                 if (NVME_CC_GET_EN(ccreg) == 0) {
1637                         /* Insert the ams, mps and css bit fields */
1638                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1639                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1640                         sc->regs.csts &= ~NVME_CSTS_RDY;
1641                 } else if (sc->pending_ios == 0) {
1642                         sc->regs.csts |= NVME_CSTS_RDY;
1643                 }
1644                 break;
1645         case NVME_CR_CSTS:
1646                 break;
1647         case NVME_CR_NSSR:
1648                 /* ignore writes; don't support subsystem reset */
1649                 break;
1650         case NVME_CR_AQA:
1651                 sc->regs.aqa = (uint32_t)value;
1652                 break;
1653         case NVME_CR_ASQ_LOW:
1654                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1655                                (0xFFFFF000 & value);
1656                 break;
1657         case NVME_CR_ASQ_HI:
1658                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1659                                (value << 32);
1660                 break;
1661         case NVME_CR_ACQ_LOW:
1662                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1663                                (0xFFFFF000 & value);
1664                 break;
1665         case NVME_CR_ACQ_HI:
1666                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1667                                (value << 32);
1668                 break;
1669         default:
1670                 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1671                          __func__, offset, value, size));
1672         }
1673         pthread_mutex_unlock(&sc->mtx);
1674 }
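
/*
 * A minimal sketch of the low/high dword assembly used for ASQ and
 * ACQ above (hypothetical helpers): the guest programs each 64-bit
 * admin queue base with two 32-bit writes, and the low dword is
 * masked with 0xFFFFF000 because the registers require 4 KiB
 * alignment (bits 11:0 reserved).
 */
static uint64_t
set_low_dword(uint64_t reg, uint32_t value)
{
        /* Replace bits 31:12, keeping the high dword intact. */
        return ((reg & 0xFFFFFFFF00000000ULL) | (value & 0xFFFFF000U));
}

static uint64_t
set_high_dword(uint64_t reg, uint32_t value)
{
        /* Replace bits 63:32, keeping the aligned low dword intact. */
        return ((reg & 0x00000000FFFFFFFFULL) | ((uint64_t)value << 32));
}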
1675
1676 static void
1677 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1678                 int baridx, uint64_t offset, int size, uint64_t value)
1679 {
1680         struct pci_nvme_softc* sc = pi->pi_arg;
1681
1682         if (baridx == pci_msix_table_bar(pi) ||
1683             baridx == pci_msix_pba_bar(pi)) {
1684                 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1685                          " value 0x%lx\r\n", baridx, offset, size, value));
1686
1687                 pci_emul_msix_twrite(pi, offset, size, value);
1688                 return;
1689         }
1690
1691         switch (baridx) {
1692         case 0:
1693                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1694                 break;
1695
1696         default:
1697                 DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1698                          __func__, baridx, value));
1699         }
1700 }
1701
1702 static uint64_t
1703 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
1704 {
1705         uint64_t value;
1706
1707         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1708
1709         if (offset < NVME_DOORBELL_OFFSET) {
1710                 void *p = &(sc->regs);
1711                 pthread_mutex_lock(&sc->mtx);
1712                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1713                 pthread_mutex_unlock(&sc->mtx);
1714         } else {
1715                 value = 0;
1716                 WPRINTF(("pci_nvme: read invalid offset 0x%lx\r\n", offset));
1717         }
1718
1719         switch (size) {
1720         case 1:
1721                 value &= 0xFF;
1722                 break;
1723         case 2:
1724                 value &= 0xFFFF;
1725                 break;
1726         case 4:
1727                 value &= 0xFFFFFFFF;
1728                 break;
1729         }
1730
1731         DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1732                  offset, size, (uint32_t)value));
1733
1734         return (value);
1735 }
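
/*
 * Sketch of the width masking above (hypothetical helper): a
 * sub-register read copies `size` bytes from the register image into
 * a 64-bit temporary, which is then truncated to the access width
 * before being returned to the guest.
 */
static uint64_t
mask_to_size(uint64_t value, int size)
{
        switch (size) {
        case 1:
                return (value & 0xFF);
        case 2:
                return (value & 0xFFFF);
        case 4:
                return (value & 0xFFFFFFFF);
        default:
                return (value);
        }
}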
1736
1737
1738
1739 static uint64_t
1740 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1741     uint64_t offset, int size)
1742 {
1743         struct pci_nvme_softc* sc = pi->pi_arg;
1744
1745         if (baridx == pci_msix_table_bar(pi) ||
1746             baridx == pci_msix_pba_bar(pi)) {
1747                 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1748                         baridx, offset, size));
1749
1750                 return pci_emul_msix_tread(pi, offset, size);
1751         }
1752
1753         switch (baridx) {
1754         case 0:
1755                 return pci_nvme_read_bar_0(sc, offset, size);
1756
1757         default:
1758                 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1759         }
1760
1761         return (0);
1762 }
1763
1764
1765 static int
1766 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1767 {
1768         char bident[sizeof("XX:X:X")];
1769         char    *uopt, *xopts, *config;
1770         uint32_t sectsz;
1771         int optidx;
1772
1773         sc->max_queues = NVME_QUEUES;
1774         sc->max_qentries = NVME_MAX_QENTRIES;
1775         sc->ioslots = NVME_IOSLOTS;
1776         sc->num_squeues = sc->max_queues;
1777         sc->num_cqueues = sc->max_queues;
1778         sectsz = 0;
1779
1780         uopt = strdup(opts);
1781         optidx = 0;
1782         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1783                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1784         for (xopts = strtok(uopt, ",");
1785              xopts != NULL;
1786              xopts = strtok(NULL, ",")) {
1787
1788                 if ((config = strchr(xopts, '=')) != NULL)
1789                         *config++ = '\0';
1790
1791                 if (!strcmp("maxq", xopts)) {
1792                         sc->max_queues = atoi(config);
1793                 } else if (!strcmp("qsz", xopts)) {
1794                         sc->max_qentries = atoi(config);
1795                 } else if (!strcmp("ioslots", xopts)) {
1796                         sc->ioslots = atoi(config);
1797                 } else if (!strcmp("sectsz", xopts)) {
1798                         sectsz = atoi(config);
1799                 } else if (!strcmp("ser", xopts)) {
1800                         /*
1801                          * This field indicates the Product Serial Number in
1802                          * 7-bit ASCII, unused bytes should be space characters.
1803                          * Ref: NVMe v1.3c.
1804                          */
1805                         cpywithpad((char *)sc->ctrldata.sn,
1806                                    sizeof(sc->ctrldata.sn), config, ' ');
1807                 } else if (!strcmp("ram", xopts)) {
1808                         uint64_t sz = strtoull(config, NULL, 10);
1809
1810                         sc->nvstore.type = NVME_STOR_RAM;
1811                         sc->nvstore.size = sz * 1024 * 1024;
1812                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1813                         sc->nvstore.sectsz = 4096;
1814                         sc->nvstore.sectsz_bits = 12;
1815                         if (sc->nvstore.ctx == NULL) {
1816                                 perror("Unable to allocate RAM");
1817                                 free(uopt);
1818                                 return (-1);
1819                         }
1820                 } else if (optidx == 0) {
1821                         snprintf(bident, sizeof(bident), "%d:%d",
1822                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1823                         sc->nvstore.ctx = blockif_open(xopts, bident);
1824                         if (sc->nvstore.ctx == NULL) {
1825                                 perror("Could not open backing file");
1826                                 free(uopt);
1827                                 return (-1);
1828                         }
1829                         sc->nvstore.type = NVME_STOR_BLOCKIF;
1830                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1831                 } else {
1832                         fprintf(stderr, "Invalid option %s\n", xopts);
1833                         free(uopt);
1834                         return (-1);
1835                 }
1836
1837                 optidx++;
1838         }
1839         free(uopt);
1840
1841         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1842                 fprintf(stderr, "backing store not specified\n");
1843                 return (-1);
1844         }
1845         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1846                 sc->nvstore.sectsz = sectsz;
1847         else if (sc->nvstore.type != NVME_STOR_RAM)
1848                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
1849         for (sc->nvstore.sectsz_bits = 9;
1850              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1851              sc->nvstore.sectsz_bits++);
1852
1853         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1854                 sc->max_queues = NVME_QUEUES;
1855
1856         if (sc->max_qentries <= 0) {
1857                 fprintf(stderr, "Invalid qsz option\n");
1858                 return (-1);
1859         }
1860         if (sc->ioslots <= 0) {
1861                 fprintf(stderr, "Invalid ioslots option\n");
1862                 return (-1);
1863         }
1864
1865         return (0);
1866 }
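
/*
 * Example option strings this parser accepts (the device path and
 * slot below are hypothetical), matching the usage block at the top
 * of the file:
 *
 *   -s 3,nvme,/dev/zvol/tank/nvmedisk,maxq=8,qsz=1024,sectsz=4096,ser=ABC123
 *   -s 3,nvme,ram=1024            (RAM-backed 1 GiB namespace)
 *
 * A worked example of the sector-size-to-shift loop above, written
 * out as a hypothetical helper: the shift starts at 9 (512 bytes) and
 * advances to the first power of two that reaches sectsz.
 */
static int
sectsz_to_bits(uint32_t sectsz)
{
        int bits = 9;                   /* 1 << 9 == 512, the minimum */

        while ((1 << bits) < (int)sectsz)
                bits++;
        return (bits);                  /* 4096 -> 12, 8192 -> 13 */
}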
1867
1868 static int
1869 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1870 {
1871         struct pci_nvme_softc *sc;
1872         uint32_t pci_membar_sz;
1873         int     error;
1874
1875         error = 0;
1876
1877         sc = calloc(1, sizeof(struct pci_nvme_softc));
1878         pi->pi_arg = sc;
1879         sc->nsc_pi = pi;
1880
1881         error = pci_nvme_parse_opts(sc, opts);
1882         if (error < 0)
1883                 goto done;
1884         else
1885                 error = 0;
1886
1887         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1888         for (int i = 0; i < sc->ioslots; i++) {
1889                 if (i < (sc->ioslots-1))
1890                         sc->ioreqs[i].next = &sc->ioreqs[i+1];
1891                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1892                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1893         }
1894         sc->ioreqs_free = sc->ioreqs;
1895         sc->intr_coales_aggr_thresh = 1;
1896
1897         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1898         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1899         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1900         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1901         pci_set_cfgdata8(pi, PCIR_PROGIF,
1902                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1903
1904         /*
1905          * Allocate size of NVMe registers + doorbell space for all queues.
1906          *
1907          * The specification requires a minimum memory I/O window size of 16K.
1908          * The Windows driver will refuse to start a device with a smaller
1909          * window.
1910          */
1911         pci_membar_sz = sizeof(struct nvme_registers) +
1912             2 * sizeof(uint32_t) * (sc->max_queues + 1);
1913         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
1914
1915         DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1916
1917         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1918         if (error) {
1919                 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1920                 goto done;
1921         }
1922
1923         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
1924         if (error) {
1925                 WPRINTF(("%s pci add msixcap failed\r\n", __func__));
1926                 goto done;
1927         }
1928
1929         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
1930         if (error) {
1931                 WPRINTF(("%s pci add Express capability failed\r\n", __func__));
1932                 goto done;
1933         }
1934
1935         pthread_mutex_init(&sc->mtx, NULL);
1936         sem_init(&sc->iosemlock, 0, sc->ioslots);
1937
1938         pci_nvme_reset(sc);
1939         pci_nvme_init_ctrldata(sc);
1940         pci_nvme_init_nsdata(sc);
1941         pci_nvme_init_logpages(sc);
1942
1943         pci_lintr_request(pi);
1944
1945 done:
1946         return (error);
1947 }
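
/*
 * Worked example of the BAR0 sizing in pci_nvme_init above, as a
 * hypothetical helper: with the default max_queues of 16, the
 * register file plus one (SQ tail, CQ head) doorbell pair per queue
 * pair totals far less than 16 KiB, so the NVME_MMIO_SPACE_MIN floor
 * dictates the final BAR size.
 */
static uint32_t
nvme_bar0_size(uint32_t max_queues)
{
        uint32_t sz;

        /* Registers plus a doorbell pair for the admin queues and
         * each of the max_queues I/O queue pairs. */
        sz = sizeof(struct nvme_registers) +
            2 * sizeof(uint32_t) * (max_queues + 1);

        /* Bits 13:4 of BAR0 are reserved, so never size below 16 KiB. */
        return (MAX(sz, NVME_MMIO_SPACE_MIN));
}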
1948
1949
1950 struct pci_devemu pci_de_nvme = {
1951         .pe_emu =       "nvme",
1952         .pe_init =      pci_nvme_init,
1953         .pe_barwrite =  pci_nvme_write,
1954         .pe_barread =   pci_nvme_read
1955 };
1956 PCI_EMUL_SET(pci_de_nvme);