bhyve: base pci_nvme_ioreq size on advertised MDTS
FreeBSD/FreeBSD.git: usr.sbin/bhyve/pci_nvme.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
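
/*
 * Illustrative invocations using the options above (paths, slot numbers, and
 * option values are placeholders, not defaults):
 *
 *  -s 4,nvme,/path/to/disk.img,maxq=4,qsz=256,ioslots=16,sectsz=512,ser=BHYVE001,dsm=auto
 *  -s 4,nvme,ram=1024,ser=BHYVE002
 */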
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN             0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
106
107 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
108 #define NVME_MDTS               9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
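
/*
 * Worked example with the defaults above: NVME_MDTS = 9 and NVME_MPSMIN = 0
 * (4 KiB pages) give NVME_MAX_DATA_SIZE = (1 << 9) * 4096 bytes = 2 MiB per
 * command, and NVME_MAX_IOVEC = 512 page descriptors plus 1 extra in case the
 * first descriptor is not page aligned = 513.
 */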
112
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS          0xffff
115 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
116
117 /* helpers */
118
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)         ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)         ((one)  - 1)
123
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
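
/*
 * Example encoding (hypothetical queue counts): with num_squeues = 4 and
 * num_cqueues = 2, the macro yields (4 - 1) | ((2 - 1) << 16) = 0x00010003,
 * i.e. both counts are reported as zero-based values.
 */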
128
129 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
130
131 enum nvme_controller_register_offsets {
132         NVME_CR_CAP_LOW = 0x00,
133         NVME_CR_CAP_HI  = 0x04,
134         NVME_CR_VS      = 0x08,
135         NVME_CR_INTMS   = 0x0c,
136         NVME_CR_INTMC   = 0x10,
137         NVME_CR_CC      = 0x14,
138         NVME_CR_CSTS    = 0x1c,
139         NVME_CR_NSSR    = 0x20,
140         NVME_CR_AQA     = 0x24,
141         NVME_CR_ASQ_LOW = 0x28,
142         NVME_CR_ASQ_HI  = 0x2c,
143         NVME_CR_ACQ_LOW = 0x30,
144         NVME_CR_ACQ_HI  = 0x34,
145 };
146
147 enum nvme_cmd_cdw11 {
148         NVME_CMD_CDW11_PC  = 0x0001,
149         NVME_CMD_CDW11_IEN = 0x0002,
150         NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152
153 enum nvme_copy_dir {
154         NVME_COPY_TO_PRP,
155         NVME_COPY_FROM_PRP,
156 };
157
158 #define NVME_CQ_INTEN   0x01
159 #define NVME_CQ_INTCOAL 0x02
160
161 struct nvme_completion_queue {
162         struct nvme_completion *qbase;
163         pthread_mutex_t mtx;
164         uint32_t        size;
165         uint16_t        tail; /* nvme progress */
166         uint16_t        head; /* guest progress */
167         uint16_t        intr_vec;
168         uint32_t        intr_en;
169 };
170
171 struct nvme_submission_queue {
172         struct nvme_command *qbase;
173         pthread_mutex_t mtx;
174         uint32_t        size;
175         uint16_t        head; /* nvme progress */
176         uint16_t        tail; /* guest progress */
177         uint16_t        cqid; /* completion queue id */
178         int             qpriority;
179 };
180
181 enum nvme_storage_type {
182         NVME_STOR_BLOCKIF = 0,
183         NVME_STOR_RAM = 1,
184 };
185
186 struct pci_nvme_blockstore {
187         enum nvme_storage_type type;
188         void            *ctx;
189         uint64_t        size;
190         uint32_t        sectsz;
191         uint32_t        sectsz_bits;
192         uint64_t        eui64;
193         uint32_t        deallocate:1;
194 };
195
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer Size (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
203 #define MDTS_PAD_SIZE \
204         NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205         NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206         0
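
/*
 * For example, if block_if.h defined BLOCKIF_IOV_MAX as 128, MDTS_PAD_SIZE
 * would evaluate to 513 - 128 = 385 additional iovec entries; if blockif
 * already provides at least NVME_MAX_IOVEC entries, no padding is reserved.
 */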
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221
222         struct blockif_req io_req;
223
224         struct iovec    iovpadding[MDTS_PAD_SIZE];
225 };
226
227 enum nvme_dsm_type {
228         /* Dataset Management bit in ONCS reflects backing storage capability */
229         NVME_DATASET_MANAGEMENT_AUTO,
230         /* Unconditionally set Dataset Management bit in ONCS */
231         NVME_DATASET_MANAGEMENT_ENABLE,
232         /* Unconditionally clear Dataset Management bit in ONCS */
233         NVME_DATASET_MANAGEMENT_DISABLE,
234 };
235
236 struct pci_nvme_softc;
237 struct nvme_feature_obj;
238
239 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
240     struct nvme_feature_obj *,
241     struct nvme_command *,
242     struct nvme_completion *);
243
244 struct nvme_feature_obj {
245         uint32_t        cdw11;
246         nvme_feature_cb set;
247         nvme_feature_cb get;
248         bool namespace_specific;
249 };
250
251 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
252
253 struct pci_nvme_softc {
254         struct pci_devinst *nsc_pi;
255
256         pthread_mutex_t mtx;
257
258         struct nvme_registers regs;
259
260         struct nvme_namespace_data  nsdata;
261         struct nvme_controller_data ctrldata;
262         struct nvme_error_information_entry err_log;
263         struct nvme_health_information_page health_log;
264         struct nvme_firmware_page fw_log;
265
266         struct pci_nvme_blockstore nvstore;
267
268         uint16_t        max_qentries;   /* max entries per queue */
269         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
270         uint32_t        num_cqueues;
271         uint32_t        num_squeues;
272         bool            num_q_is_set; /* Has host set Number of Queues */
273
274         struct pci_nvme_ioreq *ioreqs;
275         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
276         uint32_t        pending_ios;
277         uint32_t        ioslots;
278         sem_t           iosemlock;
279
280         /*
281          * Memory mapped Submission and Completion queues
282          * Each array includes both Admin and IO queues
283          */
284         struct nvme_completion_queue *compl_queues;
285         struct nvme_submission_queue *submit_queues;
286
287         struct nvme_feature_obj feat[NVME_FID_MAX];
288
289         enum nvme_dsm_type dataset_management;
290 };
291
292
293 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
294 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
295 static void pci_nvme_io_done(struct blockif_req *, int);
296
297 /* Controller Configuration utils */
298 #define NVME_CC_GET_EN(cc) \
299         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
300 #define NVME_CC_GET_CSS(cc) \
301         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
302 #define NVME_CC_GET_SHN(cc) \
303         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
304 #define NVME_CC_GET_IOSQES(cc) \
305         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
306 #define NVME_CC_GET_IOCQES(cc) \
307         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
308
309 #define NVME_CC_WRITE_MASK \
310         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
311          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
312          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
313
314 #define NVME_CC_NEN_WRITE_MASK \
315         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
316          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
317          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
318
319 /* Controller Status utils */
320 #define NVME_CSTS_GET_RDY(sts) \
321         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
322
323 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
324
325 /* Completion Queue status word utils */
326 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
327 #define NVME_STATUS_MASK \
328         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
329          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
330
331 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
332         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
333
334 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
335     struct nvme_feature_obj *,
336     struct nvme_command *,
337     struct nvme_completion *);
338 static void nvme_feature_num_queues(struct pci_nvme_softc *,
339     struct nvme_feature_obj *,
340     struct nvme_command *,
341     struct nvme_completion *);
342
343 static __inline void
344 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
345 {
346         size_t len;
347
348         len = strnlen(src, dst_size);
349         memset(dst, pad, dst_size);
350         memcpy(dst, src, len);
351 }
352
353 static __inline void
354 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
355 {
356
357         *status &= ~NVME_STATUS_MASK;
358         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
359                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
360 }
361
362 static __inline void
363 pci_nvme_status_genc(uint16_t *status, uint16_t code)
364 {
365
366         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
367 }
368
369 /*
370  * Initialize the requested number of IO Submission and Completion Queues.
371  * Admin queues are allocated implicitly.
372  */
373 static void
374 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
375 {
376         uint32_t i;
377
378         /*
379          * Allocate and initialize the Submission Queues
380          */
381         if (nsq > NVME_QUEUES) {
382                 WPRINTF("%s: clamping number of SQ from %u to %u",
383                                         __func__, nsq, NVME_QUEUES);
384                 nsq = NVME_QUEUES;
385         }
386
387         sc->num_squeues = nsq;
388
389         sc->submit_queues = calloc(sc->num_squeues + 1,
390                                 sizeof(struct nvme_submission_queue));
391         if (sc->submit_queues == NULL) {
392                 WPRINTF("%s: SQ allocation failed", __func__);
393                 sc->num_squeues = 0;
394         } else {
395                 struct nvme_submission_queue *sq = sc->submit_queues;
396
397                 for (i = 0; i < sc->num_squeues; i++)
398                         pthread_mutex_init(&sq[i].mtx, NULL);
399         }
400
401         /*
402          * Allocate and initialize the Completion Queues
403          */
404         if (ncq > NVME_QUEUES) {
405                 WPRINTF("%s: clamping number of CQ from %u to %u",
406                                         __func__, ncq, NVME_QUEUES);
407                 ncq = NVME_QUEUES;
408         }
409
410         sc->num_cqueues = ncq;
411
412         sc->compl_queues = calloc(sc->num_cqueues + 1,
413                                 sizeof(struct nvme_completion_queue));
414         if (sc->compl_queues == NULL) {
415                 WPRINTF("%s: CQ allocation failed", __func__);
416                 sc->num_cqueues = 0;
417         } else {
418                 struct nvme_completion_queue *cq = sc->compl_queues;
419
420                 for (i = 0; i < sc->num_cqueues; i++)
421                         pthread_mutex_init(&cq[i].mtx, NULL);
422         }
423 }
424
425 static void
426 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
427 {
428         struct nvme_controller_data *cd = &sc->ctrldata;
429
430         cd->vid = 0xFB5D;
431         cd->ssvid = 0x0000;
432
433         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
434         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
435
436         /* Recommended Arbitration Burst (2^rab commands per arbitration round) */
437         cd->rab   = 4;
438
439         /* FreeBSD OUI */
440         cd->ieee[0] = 0x58;
441         cd->ieee[1] = 0x9c;
442         cd->ieee[2] = 0xfc;
443
444         cd->mic = 0;
445
446         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
447
448         cd->ver = 0x00010300;
449
450         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
451         cd->acl = 2;
452         cd->aerl = 4;
453
454         cd->lpa = 0;    /* TODO: support some simple things like SMART */
455         cd->elpe = 0;   /* max error log page entries */
456         cd->npss = 1;   /* Number of Power States Support (zero-based value) */
457
458         /* Warning Composite Temperature Threshold */
459         cd->wctemp = 0x0157;
460
461         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
462             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
463         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
464             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
465         cd->nn = 1;     /* number of namespaces */
466
467         cd->oncs = 0;
468         switch (sc->dataset_management) {
469         case NVME_DATASET_MANAGEMENT_AUTO:
470                 if (sc->nvstore.deallocate)
471                         cd->oncs |= NVME_ONCS_DSM;
472                 break;
473         case NVME_DATASET_MANAGEMENT_ENABLE:
474                 cd->oncs |= NVME_ONCS_DSM;
475                 break;
476         default:
477                 break;
478         }
479
480         cd->fna = 0x03;
481
482         cd->power_state[0].mp = 10;
483 }
484
485 /*
486  * Calculate the CRC-16 of the given buffer
487  * See copyright attribution at top of file
488  */
489 static uint16_t
490 crc16(uint16_t crc, const void *buffer, unsigned int len)
491 {
492         const unsigned char *cp = buffer;
493         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
494         static uint16_t const crc16_table[256] = {
495                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
496                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
497                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
498                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
499                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
500                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
501                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
502                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
503                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
504                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
505                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
506                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
507                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
508                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
509                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
510                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
511                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
512                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
513                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
514                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
515                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
516                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
517                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
518                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
519                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
520                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
521                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
522                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
523                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
524                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
525                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
526                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
527         };
528
529         while (len--)
530                 crc = (((crc >> 8) & 0xffU) ^
531                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
532         return crc;
533 }
534
535 static void
536 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
537     struct nvme_namespace_data *nd, uint32_t nsid,
538     struct pci_nvme_blockstore *nvstore)
539 {
540
541         /* Get capacity and block size information from backing store */
542         nd->nsze = nvstore->size / nvstore->sectsz;
543         nd->ncap = nd->nsze;
544         nd->nuse = nd->nsze;
545
546         if (nvstore->type == NVME_STOR_BLOCKIF)
547                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
548
549         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
550         nd->flbas = 0;
551
552         /* Create an EUI-64 if user did not provide one */
553         if (nvstore->eui64 == 0) {
554                 char *data = NULL;
555                 uint64_t eui64 = nvstore->eui64;
556
557                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
558                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
559
560                 if (data != NULL) {
561                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
562                         free(data);
563                 }
564                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
565         }
566         be64enc(nd->eui64, nvstore->eui64);
567
568         /* LBA data-sz = 2^lbads */
569         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
570 }
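
/*
 * The auto-generated EUI-64 above is, roughly:
 *   (OUI_FREEBSD_NVME_LOW | crc16("<vmname><bus><slot><func>")) << 16 | nsid
 * so it is deterministic for a given VM name and PCI location, and distinct
 * namespaces or devices get distinct identifiers.
 */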
571
572 static void
573 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
574 {
575
576         memset(&sc->err_log, 0, sizeof(sc->err_log));
577         memset(&sc->health_log, 0, sizeof(sc->health_log));
578         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
579 }
580
581 static void
582 pci_nvme_init_features(struct pci_nvme_softc *sc)
583 {
584
585         sc->feat[0].set = nvme_feature_invalid_cb;
586         sc->feat[0].get = nvme_feature_invalid_cb;
587
588         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
589         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
590         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
591 }
592
593 static void
594 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
595 {
596         uint32_t i;
597
598         DPRINTF("%s", __func__);
599
600         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
601             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
602             (60 << NVME_CAP_LO_REG_TO_SHIFT);
603
604         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
605
606         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
607
608         sc->regs.cc = 0;
609         sc->regs.csts = 0;
610
611         assert(sc->submit_queues != NULL);
612
613         for (i = 0; i < sc->num_squeues + 1; i++) {
614                 sc->submit_queues[i].qbase = NULL;
615                 sc->submit_queues[i].size = 0;
616                 sc->submit_queues[i].cqid = 0;
617                 sc->submit_queues[i].tail = 0;
618                 sc->submit_queues[i].head = 0;
619         }
620
621         assert(sc->compl_queues != NULL);
622
623         for (i = 0; i < sc->num_cqueues + 1; i++) {
624                 sc->compl_queues[i].qbase = NULL;
625                 sc->compl_queues[i].size = 0;
626                 sc->compl_queues[i].tail = 0;
627                 sc->compl_queues[i].head = 0;
628         }
629
630         sc->num_q_is_set = false;
631 }
632
633 static void
634 pci_nvme_reset(struct pci_nvme_softc *sc)
635 {
636         pthread_mutex_lock(&sc->mtx);
637         pci_nvme_reset_locked(sc);
638         pthread_mutex_unlock(&sc->mtx);
639 }
640
641 static void
642 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
643 {
644         uint16_t acqs, asqs;
645
646         DPRINTF("%s", __func__);
647
648         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
649         sc->submit_queues[0].size = asqs;
650         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
651                     sizeof(struct nvme_command) * asqs);
652
653         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
654                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
655
656         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
657             NVME_AQA_REG_ACQS_MASK) + 1;
658         sc->compl_queues[0].size = acqs;
659         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
660                  sizeof(struct nvme_completion) * acqs);
661         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
662
663         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
664                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
665 }
666
667 static int
668 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
669         size_t len, enum nvme_copy_dir dir)
670 {
671         uint8_t *p;
672         size_t bytes;
673
674         if (len > (8 * 1024)) {
675                 return (-1);
676         }
677
678         /* Copy from the start of prp1 to the end of the physical page */
679         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
680         bytes = MIN(bytes, len);
681
682         p = vm_map_gpa(ctx, prp1, bytes);
683         if (p == NULL) {
684                 return (-1);
685         }
686
687         if (dir == NVME_COPY_TO_PRP)
688                 memcpy(p, b, bytes);
689         else
690                 memcpy(b, p, bytes);
691
692         b += bytes;
693
694         len -= bytes;
695         if (len == 0) {
696                 return (0);
697         }
698
699         len = MIN(len, PAGE_SIZE);
700
701         p = vm_map_gpa(ctx, prp2, len);
702         if (p == NULL) {
703                 return (-1);
704         }
705
706         if (dir == NVME_COPY_TO_PRP)
707                 memcpy(p, b, len);
708         else
709                 memcpy(b, p, len);
710
711         return (0);
712 }
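
/*
 * Example of the two-segment copy above (illustrative numbers): with 4 KiB
 * pages, a prp1 that is offset 0x800 into its page, and len = 6000, the first
 * memcpy covers 4096 - 0x800 = 2048 bytes up to the page boundary, and the
 * remaining 3952 bytes are copied via the page addressed by prp2. Lengths
 * above 8 KiB, which would require walking a PRP list, are rejected up front.
 */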
713
714 /*
715  * Post a Completion Queue Entry
716  *
717  * Write the completion entry and advance the CQ tail pointer
718  */
719 static void
720 pci_nvme_cq_update(struct pci_nvme_softc *sc,
721                 struct nvme_completion_queue *cq,
722                 uint32_t cdw0,
723                 uint16_t cid,
724                 uint16_t sqid,
725                 uint16_t status)
726 {
727         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
728         struct nvme_completion *cqe;
729
730         assert(cq->qbase != NULL);
731
732         pthread_mutex_lock(&cq->mtx);
733
734         cqe = &cq->qbase[cq->tail];
735
736         /* Flip the phase bit */
737         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
738
739         cqe->cdw0 = cdw0;
740         cqe->sqhd = sq->head;
741         cqe->sqid = sqid;
742         cqe->cid = cid;
743         cqe->status = status;
744
745         cq->tail++;
746         if (cq->tail >= cq->size) {
747                 cq->tail = 0;
748         }
749
750         pthread_mutex_unlock(&cq->mtx);
751 }
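
/*
 * Note on the phase handling above: the Phase Tag of the entry being
 * overwritten is inverted, so entries posted on the current pass through the
 * queue carry the opposite phase from stale entries of the previous pass,
 * which is how the host detects newly posted completions.
 */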
752
753 static int
754 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
755         struct nvme_completion* compl)
756 {
757         uint16_t qid = command->cdw10 & 0xffff;
758
759         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
760         if (qid == 0 || qid > sc->num_squeues ||
761             (sc->submit_queues[qid].qbase == NULL)) {
762                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
763                         __func__, qid, sc->num_squeues);
764                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
765                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
766                 return (1);
767         }
768
769         sc->submit_queues[qid].qbase = NULL;
770         sc->submit_queues[qid].cqid = 0;
771         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
772         return (1);
773 }
774
775 static int
776 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
777         struct nvme_completion* compl)
778 {
779         if (command->cdw11 & NVME_CMD_CDW11_PC) {
780                 uint16_t qid = command->cdw10 & 0xffff;
781                 struct nvme_submission_queue *nsq;
782
783                 if ((qid == 0) || (qid > sc->num_squeues) ||
784                     (sc->submit_queues[qid].qbase != NULL)) {
785                         WPRINTF("%s queue index %u > num_squeues %u",
786                                 __func__, qid, sc->num_squeues);
787                         pci_nvme_status_tc(&compl->status,
788                             NVME_SCT_COMMAND_SPECIFIC,
789                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
790                         return (1);
791                 }
792
793                 nsq = &sc->submit_queues[qid];
794                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
795                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
796                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
797                         /*
798                          * Queues must specify at least two entries
799                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
800                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
801                          */
802                         pci_nvme_status_tc(&compl->status,
803                             NVME_SCT_COMMAND_SPECIFIC,
804                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
805                         return (1);
806                 }
807
808                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
809                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
810                         pci_nvme_status_tc(&compl->status,
811                             NVME_SCT_COMMAND_SPECIFIC,
812                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
813                         return (1);
814                 }
815
816                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
817                         pci_nvme_status_tc(&compl->status,
818                             NVME_SCT_COMMAND_SPECIFIC,
819                             NVME_SC_COMPLETION_QUEUE_INVALID);
820                         return (1);
821                 }
822
823                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
824
825                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
826                               sizeof(struct nvme_command) * (size_t)nsq->size);
827
828                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
829                         qid, nsq->size, nsq->qbase, nsq->cqid);
830
831                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
832
833                 DPRINTF("%s completed creating IOSQ qid %u",
834                          __func__, qid);
835         } else {
836                 /*
837                  * Guest requested a non-contiguous (list-based) Submission
838                  * Queue, which this emulation does not support.
839                  */
840                 WPRINTF("%s unsupported non-contig (list-based) "
841                          "create i/o submission queue", __func__);
842
843                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
844         }
845         return (1);
846 }
847
848 static int
849 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
850         struct nvme_completion* compl)
851 {
852         uint16_t qid = command->cdw10 & 0xffff;
853         uint16_t sqid;
854
855         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
856         if (qid == 0 || qid > sc->num_cqueues ||
857             (sc->compl_queues[qid].qbase == NULL)) {
858                 WPRINTF("%s queue index %u / num_cqueues %u",
859                         __func__, qid, sc->num_cqueues);
860                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
861                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
862                 return (1);
863         }
864
865         /* Deleting an Active CQ is an error */
866         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
867                 if (sc->submit_queues[sqid].cqid == qid) {
868                         pci_nvme_status_tc(&compl->status,
869                             NVME_SCT_COMMAND_SPECIFIC,
870                             NVME_SC_INVALID_QUEUE_DELETION);
871                         return (1);
872                 }
873
874         sc->compl_queues[qid].qbase = NULL;
875         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
876         return (1);
877 }
878
879 static int
880 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
881         struct nvme_completion* compl)
882 {
883         struct nvme_completion_queue *ncq;
884         uint16_t qid = command->cdw10 & 0xffff;
885
886         /* Only support Physically Contiguous queues */
887         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
888                 WPRINTF("%s unsupported non-contig (list-based) "
889                          "create i/o completion queue",
890                          __func__);
891
892                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
893                 return (1);
894         }
895
896         if ((qid == 0) || (qid > sc->num_cqueues) ||
897             (sc->compl_queues[qid].qbase != NULL)) {
898                 WPRINTF("%s queue index %u > num_cqueues %u",
899                         __func__, qid, sc->num_cqueues);
900                 pci_nvme_status_tc(&compl->status,
901                     NVME_SCT_COMMAND_SPECIFIC,
902                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
903                 return (1);
904         }
905
906         ncq = &sc->compl_queues[qid];
907         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
908         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
909         if (ncq->intr_vec > (sc->max_queues + 1)) {
910                 pci_nvme_status_tc(&compl->status,
911                     NVME_SCT_COMMAND_SPECIFIC,
912                     NVME_SC_INVALID_INTERRUPT_VECTOR);
913                 return (1);
914         }
915
916         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
917         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
918                 /*
919                  * Queues must specify at least two entries
920                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
921                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
922                  */
923                 pci_nvme_status_tc(&compl->status,
924                     NVME_SCT_COMMAND_SPECIFIC,
925                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
926                 return (1);
927         }
928         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
929                      command->prp1,
930                      sizeof(struct nvme_completion) * (size_t)ncq->size);
931
932         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
933
934
935         return (1);
936 }
937
938 static int
939 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
940         struct nvme_completion* compl)
941 {
942         uint32_t logsize;
943         uint8_t logpage = command->cdw10 & 0xFF;
944
945         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
946
947         /*
948          * Command specifies the number of dwords to return in fields NUMDU
949          * and NUMDL. This is a zero-based value.
950          */
951         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
952         logsize *= sizeof(uint32_t);
953
954         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
955
956         switch (logpage) {
957         case NVME_LOG_ERROR:
958                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
959                     command->prp2, (uint8_t *)&sc->err_log,
960                     MIN(logsize, sizeof(sc->err_log)),
961                     NVME_COPY_TO_PRP);
962                 break;
963         case NVME_LOG_HEALTH_INFORMATION:
964                 /* TODO: present some smart info */
965                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
966                     command->prp2, (uint8_t *)&sc->health_log,
967                     MIN(logsize, sizeof(sc->health_log)),
968                     NVME_COPY_TO_PRP);
969                 break;
970         case NVME_LOG_FIRMWARE_SLOT:
971                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
972                     command->prp2, (uint8_t *)&sc->fw_log,
973                     MIN(logsize, sizeof(sc->fw_log)),
974                     NVME_COPY_TO_PRP);
975                 break;
976         default:
977                 DPRINTF("%s get log page %x command not supported",
978                         __func__, logpage);
979
980                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
981                     NVME_SC_INVALID_LOG_PAGE);
982         }
983
984         return (1);
985 }
986
987 static int
988 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
989         struct nvme_completion* compl)
990 {
991         void *dest;
992         uint16_t status;
993
994         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
995                 command->cdw10 & 0xFF, command->nsid);
996
997         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
998
999         switch (command->cdw10 & 0xFF) {
1000         case 0x00: /* return Identify Namespace data structure */
1001                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1002                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1003                     NVME_COPY_TO_PRP);
1004                 break;
1005         case 0x01: /* return Identify Controller data structure */
1006                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1007                     command->prp2, (uint8_t *)&sc->ctrldata,
1008                     sizeof(sc->ctrldata),
1009                     NVME_COPY_TO_PRP);
1010                 break;
1011         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1012                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1013                                   sizeof(uint32_t) * 1024);
1014                 ((uint32_t *)dest)[0] = 1;
1015                 ((uint32_t *)dest)[1] = 0;
1016                 break;
1017         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1018                 if (command->nsid != 1) {
1019                         pci_nvme_status_genc(&status,
1020                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1021                         break;
1022                 }
1023                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1024                                   sizeof(uint32_t) * 1024);
1025                 /* All bytes after the descriptor shall be zero */
1026                 bzero(dest, sizeof(uint32_t) * 1024);
1027
1028                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1029                 ((uint8_t *)dest)[0] = 1;
1030                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1031                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1032                 break;
1033         default:
1034                 DPRINTF("%s unsupported identify command requested 0x%x",
1035                          __func__, command->cdw10 & 0xFF);
1036                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1037                 break;
1038         }
1039
1040         compl->status = status;
1041         return (1);
1042 }
1043
1044 static const char *
1045 nvme_fid_to_name(uint8_t fid)
1046 {
1047         const char *name;
1048
1049         switch (fid) {
1050         case NVME_FEAT_ARBITRATION:
1051                 name = "Arbitration";
1052                 break;
1053         case NVME_FEAT_POWER_MANAGEMENT:
1054                 name = "Power Management";
1055                 break;
1056         case NVME_FEAT_LBA_RANGE_TYPE:
1057                 name = "LBA Range Type";
1058                 break;
1059         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1060                 name = "Temperature Threshold";
1061                 break;
1062         case NVME_FEAT_ERROR_RECOVERY:
1063                 name = "Error Recovery";
1064                 break;
1065         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1066                 name = "Volatile Write Cache";
1067                 break;
1068         case NVME_FEAT_NUMBER_OF_QUEUES:
1069                 name = "Number of Queues";
1070                 break;
1071         case NVME_FEAT_INTERRUPT_COALESCING:
1072                 name = "Interrupt Coalescing";
1073                 break;
1074         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1075                 name = "Interrupt Vector Configuration";
1076                 break;
1077         case NVME_FEAT_WRITE_ATOMICITY:
1078                 name = "Write Atomicity Normal";
1079                 break;
1080         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1081                 name = "Asynchronous Event Configuration";
1082                 break;
1083         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1084                 name = "Autonomous Power State Transition";
1085                 break;
1086         case NVME_FEAT_HOST_MEMORY_BUFFER:
1087                 name = "Host Memory Buffer";
1088                 break;
1089         case NVME_FEAT_TIMESTAMP:
1090                 name = "Timestamp";
1091                 break;
1092         case NVME_FEAT_KEEP_ALIVE_TIMER:
1093                 name = "Keep Alive Timer";
1094                 break;
1095         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1096                 name = "Host Controlled Thermal Management";
1097                 break;
1098         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1099                 name = "Non-Operation Power State Config";
1100                 break;
1101         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1102                 name = "Read Recovery Level Config";
1103                 break;
1104         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1105                 name = "Predictable Latency Mode Config";
1106                 break;
1107         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1108                 name = "Predictable Latency Mode Window";
1109                 break;
1110         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1111                 name = "LBA Status Information Report Interval";
1112                 break;
1113         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1114                 name = "Host Behavior Support";
1115                 break;
1116         case NVME_FEAT_SANITIZE_CONFIG:
1117                 name = "Sanitize Config";
1118                 break;
1119         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1120                 name = "Endurance Group Event Configuration";
1121                 break;
1122         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1123                 name = "Software Progress Marker";
1124                 break;
1125         case NVME_FEAT_HOST_IDENTIFIER:
1126                 name = "Host Identifier";
1127                 break;
1128         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1129                 name = "Reservation Notification Mask";
1130                 break;
1131         case NVME_FEAT_RESERVATION_PERSISTENCE:
1132                 name = "Reservation Persistence";
1133                 break;
1134         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1135                 name = "Namespace Write Protection Config";
1136                 break;
1137         default:
1138                 name = "Unknown";
1139                 break;
1140         }
1141
1142         return (name);
1143 }
1144
1145 static void
1146 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1147     struct nvme_feature_obj *feat,
1148     struct nvme_command *command,
1149     struct nvme_completion *compl)
1150 {
1151
1152         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1153 }
1154
1155 static void
1156 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1157     struct nvme_feature_obj *feat,
1158     struct nvme_command *command,
1159     struct nvme_completion *compl)
1160 {
1161         uint16_t nqr;   /* Number of Queues Requested */
1162
1163         if (sc->num_q_is_set) {
1164                 WPRINTF("%s: Number of Queues already set", __func__);
1165                 pci_nvme_status_genc(&compl->status,
1166                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1167                 return;
1168         }
1169
1170         nqr = command->cdw11 & 0xFFFF;
1171         if (nqr == 0xffff) {
1172                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1173                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1174                 return;
1175         }
1176
1177         sc->num_squeues = ONE_BASED(nqr);
1178         if (sc->num_squeues > sc->max_queues) {
1179                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1180                                         sc->max_queues);
1181                 sc->num_squeues = sc->max_queues;
1182         }
1183
1184         nqr = (command->cdw11 >> 16) & 0xFFFF;
1185         if (nqr == 0xffff) {
1186                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1187                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1188                 return;
1189         }
1190
1191         sc->num_cqueues = ONE_BASED(nqr);
1192         if (sc->num_cqueues > sc->max_queues) {
1193                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1194                                         sc->max_queues);
1195                 sc->num_cqueues = sc->max_queues;
1196         }
1197
1198         /* Patch the command value which will be saved on callback's return */
1199         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1200         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1201
1202         sc->num_q_is_set = true;
1203 }
1204
1205 static int
1206 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1207         struct nvme_completion *compl)
1208 {
1209         struct nvme_feature_obj *feat;
1210         uint32_t nsid = command->nsid;
1211         uint8_t fid = command->cdw10 & 0xFF;
1212
1213         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1214
1215         if (fid >= NVME_FID_MAX) {
1216                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1217                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1218                 return (1);
1219         }
1220         feat = &sc->feat[fid];
1221
1222         if (!feat->namespace_specific &&
1223             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1224                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1225                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1226                 return (1);
1227         }
1228
1229         compl->cdw0 = 0;
1230         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1231
1232         if (feat->set)
1233                 feat->set(sc, feat, command, compl);
1234
1235         if (compl->status == NVME_SC_SUCCESS)
1236                 feat->cdw11 = command->cdw11;
1237
1238         return (0);
1239 }
1240
1241 static int
1242 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1243         struct nvme_completion* compl)
1244 {
1245         struct nvme_feature_obj *feat;
1246         uint8_t fid = command->cdw10 & 0xFF;
1247
1248         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1249
1250         if (fid >= NVME_FID_MAX) {
1251                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1252                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1253                 return (1);
1254         }
1255
1256         compl->cdw0 = 0;
1257         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1258
1259         feat = &sc->feat[fid];
1260         if (feat->get) {
1261                 feat->get(sc, feat, command, compl);
1262         }
1263
1264         if (compl->status == NVME_SC_SUCCESS) {
1265                 compl->cdw0 = feat->cdw11;
1266         }
1267
1268         return (0);
1269 }
1270
1271 static int
1272 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1273         struct nvme_completion* compl)
1274 {
1275         uint8_t ses, lbaf, pi;
1276
1277         /* Only supports Secure Erase Setting - User Data Erase */
1278         ses = (command->cdw10 >> 9) & 0x7;
1279         if (ses > 0x1) {
1280                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1281                 return (1);
1282         }
1283
1284         /* Only supports a single LBA Format */
1285         lbaf = command->cdw10 & 0xf;
1286         if (lbaf != 0) {
1287                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1288                     NVME_SC_INVALID_FORMAT);
1289                 return (1);
1290         }
1291
1292         /* Doesn't support Protection Information */
1293         pi = (command->cdw10 >> 5) & 0x7;
1294         if (pi != 0) {
1295                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1296                 return (1);
1297         }
1298
1299         if (sc->nvstore.type == NVME_STOR_RAM) {
1300                 if (sc->nvstore.ctx)
1301                         free(sc->nvstore.ctx);
1302                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1303                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1304         } else {
1305                 struct pci_nvme_ioreq *req;
1306                 int err;
1307
1308                 req = pci_nvme_get_ioreq(sc);
1309                 if (req == NULL) {
1310                         pci_nvme_status_genc(&compl->status,
1311                             NVME_SC_INTERNAL_DEVICE_ERROR);
1312                         WPRINTF("%s: unable to allocate IO req", __func__);
1313                         return (1);
1314                 }
1315                 req->nvme_sq = &sc->submit_queues[0];
1316                 req->sqid = 0;
1317                 req->opc = command->opc;
1318                 req->cid = command->cid;
1319                 req->nsid = command->nsid;
1320
1321                 req->io_req.br_offset = 0;
1322                 req->io_req.br_resid = sc->nvstore.size;
1323                 req->io_req.br_callback = pci_nvme_io_done;
1324
1325                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1326                 if (err) {
1327                         pci_nvme_status_genc(&compl->status,
1328                             NVME_SC_INTERNAL_DEVICE_ERROR);
1329                         pci_nvme_release_ioreq(sc, req);
1330                 }
1331         }
1332
1333         return (1);
1334 }
1335
1336 static int
1337 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1338         struct nvme_completion* compl)
1339 {
1340         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1341                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1342
1343         /* TODO: search for the command ID and abort it */
1344
1345         compl->cdw0 = 1;
1346         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1347         return (1);
1348 }
1349
1350 static int
1351 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1352         struct nvme_command* command, struct nvme_completion* compl)
1353 {
1354         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1355
1356         /*
1357          * TODO: raise events when they happen based on the Set Features cmd.
1358          * These events happen async, so only set completion successful if
1359          * there is an event reflective of the request to get event.
1360          */
1361         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1362             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1363         return (0);
1364 }
1365
1366 static void
1367 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1368 {
1369         struct nvme_completion compl;
1370         struct nvme_command *cmd;
1371         struct nvme_submission_queue *sq;
1372         struct nvme_completion_queue *cq;
1373         uint16_t sqhead;
1374
1375         DPRINTF("%s index %u", __func__, (uint32_t)value);
1376
1377         sq = &sc->submit_queues[0];
1378         cq = &sc->compl_queues[0];
1379
1380         pthread_mutex_lock(&sq->mtx);
1381
1382         sqhead = sq->head;
1383         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1384         
1385         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1386                 cmd = &(sq->qbase)[sqhead];
1387                 compl.cdw0 = 0;
1388                 compl.status = 0;
1389
1390                 switch (cmd->opc) {
1391                 case NVME_OPC_DELETE_IO_SQ:
1392                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1393                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1394                         break;
1395                 case NVME_OPC_CREATE_IO_SQ:
1396                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1397                         nvme_opc_create_io_sq(sc, cmd, &compl);
1398                         break;
1399                 case NVME_OPC_DELETE_IO_CQ:
1400                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1401                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1402                         break;
1403                 case NVME_OPC_CREATE_IO_CQ:
1404                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1405                         nvme_opc_create_io_cq(sc, cmd, &compl);
1406                         break;
1407                 case NVME_OPC_GET_LOG_PAGE:
1408                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1409                         nvme_opc_get_log_page(sc, cmd, &compl);
1410                         break;
1411                 case NVME_OPC_IDENTIFY:
1412                         DPRINTF("%s command IDENTIFY", __func__);
1413                         nvme_opc_identify(sc, cmd, &compl);
1414                         break;
1415                 case NVME_OPC_ABORT:
1416                         DPRINTF("%s command ABORT", __func__);
1417                         nvme_opc_abort(sc, cmd, &compl);
1418                         break;
1419                 case NVME_OPC_SET_FEATURES:
1420                         DPRINTF("%s command SET_FEATURES", __func__);
1421                         nvme_opc_set_features(sc, cmd, &compl);
1422                         break;
1423                 case NVME_OPC_GET_FEATURES:
1424                         DPRINTF("%s command GET_FEATURES", __func__);
1425                         nvme_opc_get_features(sc, cmd, &compl);
1426                         break;
1427                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1428                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1429                         /* XXX don't care, unhandled for now
1430                         nvme_opc_async_event_req(sc, cmd, &compl);
1431                         */
1432                         compl.status = NVME_NO_STATUS;
1433                         break;
1434                 case NVME_OPC_FORMAT_NVM:
1435                         DPRINTF("%s command FORMAT_NVM", __func__);
1436                         compl.status = NVME_NO_STATUS;
1437                         if ((sc->ctrldata.oacs &
1438                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0)
1439                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1440                         else
1441                                 nvme_opc_format_nvm(sc, cmd, &compl);
1442                         break;
1443                 default:
1444                         DPRINTF("0x%x command is not implemented",
1445                             cmd->opc);
1446                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1447                 }
1448                 sqhead = (sqhead + 1) % sq->size;
1449
1450                 if (NVME_COMPLETION_VALID(compl)) {
1451                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1452                             compl.cdw0,
1453                             cmd->cid,
1454                             0,          /* SQID */
1455                             compl.status);
1456                 }
1457         }
1458
1459         DPRINTF("setting sqhead %u", sqhead);
1460         sq->head = sqhead;
1461
1462         if (cq->head != cq->tail)
1463                 pci_generate_msix(sc->nsc_pi, 0);
1464
1465         pthread_mutex_unlock(&sq->mtx);
1466 }
1467
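     /*
      * pci_nvme_append_iov_req() below gathers guest PRP entries into
      * req->io_req.br_iov and merges guest-physically contiguous ranges
      * into a single iovec.  Purely as an illustration: 4KiB PRPs at guest
      * addresses 0x10000 and 0x11000 coalesce into one 8KiB entry, while a
      * following PRP at 0x20000 starts a new entry.  At most NVME_MAX_IOVEC
      * entries are accepted; further appends fail with -1.
      */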
1468 static int
1469 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1470         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1471 {
1472         int iovidx;
1473
1474         if (req == NULL)
1475                 return (-1);
1476
1477         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1478                 return (-1);
1479         }
1480
1481         /* concatenate contig block-iovs to minimize number of iovs */
1482         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1483                 iovidx = req->io_req.br_iovcnt - 1;
1484
1485                 req->io_req.br_iov[iovidx].iov_base =
1486                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1487                                      req->prev_gpaddr, size);
1488
1489                 req->prev_size += size;
1490                 req->io_req.br_resid += size;
1491
1492                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1493         } else {
1494                 iovidx = req->io_req.br_iovcnt;
1495                 if (iovidx == 0) {
1496                         req->io_req.br_offset = lba;
1497                         req->io_req.br_resid = 0;
1498                         req->io_req.br_param = req;
1499                 }
1500
1501                 req->io_req.br_iov[iovidx].iov_base =
1502                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1503                                      gpaddr, size);
1504
1505                 req->io_req.br_iov[iovidx].iov_len = size;
1506
1507                 req->prev_gpaddr = gpaddr;
1508                 req->prev_size = size;
1509                 req->io_req.br_resid += size;
1510
1511                 req->io_req.br_iovcnt++;
1512         }
1513
1514         return (0);
1515 }
1516
1517 static void
1518 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1519         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1520         uint32_t cdw0, uint16_t status)
1521 {
1522         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1523
1524         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1525                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1526                  NVME_STATUS_GET_SC(status));
1527
1528         pci_nvme_cq_update(sc, cq,
1529             cdw0,       /* CDW0 */
1530             cid,
1531             sqid,
1532             status);
1533
1534         if (cq->head != cq->tail) {
1535                 if (cq->intr_en & NVME_CQ_INTEN) {
1536                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1537                 } else {
1538                         DPRINTF("%s: CQ%u interrupt disabled",
1539                                                 __func__, sq->cqid);
1540                 }
1541         }
1542 }
1543
1544 static void
1545 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1546 {
1547         req->sc = NULL;
1548         req->nvme_sq = NULL;
1549         req->sqid = 0;
1550
1551         pthread_mutex_lock(&sc->mtx);
1552
1553         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1554         sc->pending_ios--;
1555
1556         /* when no more IO pending, can set to ready if device reset/enabled */
1557         if (sc->pending_ios == 0 &&
1558             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1559                 sc->regs.csts |= NVME_CSTS_RDY;
1560
1561         pthread_mutex_unlock(&sc->mtx);
1562
1563         sem_post(&sc->iosemlock);
1564 }
1565
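     /*
      * I/O requests come from a fixed pool sized by the ioslots option.
      * The iosemlock semaphore (initialized to sc->ioslots) bounds the
      * number of in-flight requests: pci_nvme_get_ioreq() blocks in
      * sem_wait() until a slot is free, and pci_nvme_release_ioreq()
      * returns the slot to ioreqs_free and wakes a waiter via sem_post().
      */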
1566 static struct pci_nvme_ioreq *
1567 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1568 {
1569         struct pci_nvme_ioreq *req = NULL;
1570
1571         sem_wait(&sc->iosemlock);
1572         pthread_mutex_lock(&sc->mtx);
1573
1574         req = STAILQ_FIRST(&sc->ioreqs_free);
1575         assert(req != NULL);
1576         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1577
1578         req->sc = sc;
1579
1580         sc->pending_ios++;
1581
1582         pthread_mutex_unlock(&sc->mtx);
1583
1584         req->io_req.br_iovcnt = 0;
1585         req->io_req.br_offset = 0;
1586         req->io_req.br_resid = 0;
1587         req->io_req.br_param = req;
1588         req->prev_gpaddr = 0;
1589         req->prev_size = 0;
1590
1591         return req;
1592 }
1593
1594 static void
1595 pci_nvme_io_done(struct blockif_req *br, int err)
1596 {
1597         struct pci_nvme_ioreq *req = br->br_param;
1598         struct nvme_submission_queue *sq = req->nvme_sq;
1599         uint16_t code, status;
1600
1601         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1602
1603         /* TODO return correct error */
1604         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1605         pci_nvme_status_genc(&status, code);
1606
1607         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1608         pci_nvme_release_ioreq(req->sc, req);
1609 }
1610
1611 /*
1612  * Implements the Flush command. The specification states:
1613  *    If a volatile write cache is not present, Flush commands complete
1614  *    successfully and have no effect
1615  * in the description of the Volatile Write Cache (VWC) field of the Identify
1616  * Controller data. Therefore, set status to Success if the command is
1617  * not supported (i.e. RAM-backed, or when blockif_flush() returns EOPNOTSUPP).
1618  */
1619 static bool
1620 nvme_opc_flush(struct pci_nvme_softc *sc,
1621     struct nvme_command *cmd,
1622     struct pci_nvme_blockstore *nvstore,
1623     struct pci_nvme_ioreq *req,
1624     uint16_t *status)
1625 {
1626         bool pending = false;
1627
1628         if (nvstore->type == NVME_STOR_RAM) {
1629                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1630         } else {
1631                 int err;
1632
1633                 req->io_req.br_callback = pci_nvme_io_done;
1634
1635                 err = blockif_flush(nvstore->ctx, &req->io_req);
1636                 switch (err) {
1637                 case 0:
1638                         pending = true;
1639                         break;
1640                 case EOPNOTSUPP:
1641                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1642                         break;
1643                 default:
1644                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1645                 }
1646         }
1647
1648         return (pending);
1649 }
1650
1651 static uint16_t
1652 nvme_write_read_ram(struct pci_nvme_softc *sc,
1653     struct pci_nvme_blockstore *nvstore,
1654     uint64_t prp1, uint64_t prp2,
1655     size_t offset, uint64_t bytes,
1656     bool is_write)
1657 {
1658         uint8_t *buf = nvstore->ctx;
1659         enum nvme_copy_dir dir;
1660         uint16_t status;
1661
1662         if (is_write)
1663                 dir = NVME_COPY_TO_PRP;
1664         else
1665                 dir = NVME_COPY_FROM_PRP;
1666
1667         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1668             buf + offset, bytes, dir))
1669                 pci_nvme_status_genc(&status,
1670                     NVME_SC_DATA_TRANSFER_ERROR);
1671         else
1672                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1673
1674         return (status);
1675 }
1676
1677 static uint16_t
1678 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1679     struct pci_nvme_blockstore *nvstore,
1680     struct pci_nvme_ioreq *req,
1681     uint64_t prp1, uint64_t prp2,
1682     size_t offset, uint64_t bytes,
1683     bool is_write)
1684 {
1685         uint64_t size;
1686         int err;
1687         uint16_t status = NVME_NO_STATUS;
1688
1689         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1690         if (pci_nvme_append_iov_req(sc, req, prp1,
1691             size, is_write, offset)) {
1692                 pci_nvme_status_genc(&status,
1693                     NVME_SC_DATA_TRANSFER_ERROR);
1694                 goto out;
1695         }
1696
1697         offset += size;
1698         bytes  -= size;
1699
1700         if (bytes == 0) {
1701                 ;
1702         } else if (bytes <= PAGE_SIZE) {
1703                 size = bytes;
1704                 if (pci_nvme_append_iov_req(sc, req, prp2,
1705                     size, is_write, offset)) {
1706                         pci_nvme_status_genc(&status,
1707                             NVME_SC_DATA_TRANSFER_ERROR);
1708                         goto out;
1709                 }
1710         } else {
1711                 void *vmctx = sc->nsc_pi->pi_vmctx;
1712                 uint64_t *prp_list = &prp2;
1713                 uint64_t *last = prp_list;
1714
1715                 /* PRP2 is pointer to a physical region page list */
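                     /*
                      * On the first pass prp_list points at PRP2 itself, so
                      * the initial "next list" lookup maps the first PRP list
                      * page from guest memory.  Each data entry then
                      * contributes at most PAGE_SIZE bytes to the iov.
                      */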
1716                 while (bytes) {
1717                         /* Last entry in list points to the next list */
1718                         if (prp_list == last) {
1719                                 uint64_t prp = *prp_list;
1720
1721                                 prp_list = paddr_guest2host(vmctx, prp,
1722                                     PAGE_SIZE - (prp % PAGE_SIZE));
1723                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
1724                         }
1725
1726                         size = MIN(bytes, PAGE_SIZE);
1727
1728                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
1729                             size, is_write, offset)) {
1730                                 pci_nvme_status_genc(&status,
1731                                     NVME_SC_DATA_TRANSFER_ERROR);
1732                                 goto out;
1733                         }
1734
1735                         offset += size;
1736                         bytes  -= size;
1737
1738                         prp_list++;
1739                 }
1740         }
1741         req->io_req.br_callback = pci_nvme_io_done;
1742         if (is_write)
1743                 err = blockif_write(nvstore->ctx, &req->io_req);
1744         else
1745                 err = blockif_read(nvstore->ctx, &req->io_req);
1746
1747         if (err)
1748                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
1749 out:
1750         return (status);
1751 }
1752
1753 static bool
1754 nvme_opc_write_read(struct pci_nvme_softc *sc,
1755     struct nvme_command *cmd,
1756     struct pci_nvme_blockstore *nvstore,
1757     struct pci_nvme_ioreq *req,
1758     uint16_t *status)
1759 {
1760         uint64_t lba, nblocks, bytes;
1761         size_t offset;
1762         bool is_write = cmd->opc == NVME_OPC_WRITE;
1763         bool pending = false;
1764
1765         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1766         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1767
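             /*
              * NLB in CDW12 is zero based, hence the +1 above (a value of 0
              * means one logical block).  The resulting byte count is bounded
              * by NVME_MAX_DATA_SIZE, i.e. the transfer size implied by the
              * advertised MDTS.
              */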
1768         bytes  = nblocks * nvstore->sectsz;
1769         if (bytes > NVME_MAX_DATA_SIZE) {
1770                 WPRINTF("%s command would exceed MDTS", __func__);
1771                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
1772                 goto out;
1773         }
1774
1775         offset = lba * nvstore->sectsz;
1776         if ((offset + bytes) > nvstore->size) {
1777                 WPRINTF("%s command would exceed LBA range", __func__);
1778                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
1779                 goto out;
1780         }
1781
1782         req->io_req.br_offset = lba;
1783
1784         /* PRP bits 1:0 must be zero */
1785         cmd->prp1 &= ~0x3UL;
1786         cmd->prp2 &= ~0x3UL;
1787
1788         if (nvstore->type == NVME_STOR_RAM) {
1789                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
1790                     cmd->prp2, offset, bytes, is_write);
1791         } else {
1792                 *status = nvme_write_read_blockif(sc, nvstore, req,
1793                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
1794
1795                 if (*status == NVME_NO_STATUS)
1796                         pending = true;
1797         }
1798 out:
1799         return (pending);
1800 }
1801
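     /*
      * Completion callback for multi-range Dataset Management (deallocate)
      * requests.  Each blockif_delete() completion re-enters this routine,
      * which issues the delete for the next range stored in br_iov:
      * prev_gpaddr is reused as the index of the range just completed and
      * prev_size as the total number of ranges.  Once every range has been
      * trimmed, or an error occurs, the NVMe completion is posted and the
      * ioreq is released.
      */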
1802 static void
1803 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1804 {
1805         struct pci_nvme_ioreq *req = br->br_param;
1806         struct pci_nvme_softc *sc = req->sc;
1807         bool done = true;
1808         uint16_t status;
1809
1810         if (err) {
1811                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1812         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1813                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1814         } else {
1815                 struct iovec *iov = req->io_req.br_iov;
1816
1817                 req->prev_gpaddr++;
1818                 iov += req->prev_gpaddr;
1819
1820                 /* The iov_* values already include the sector size */
1821                 req->io_req.br_offset = (off_t)iov->iov_base;
1822                 req->io_req.br_resid = iov->iov_len;
1823                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1824                         pci_nvme_status_genc(&status,
1825                             NVME_SC_INTERNAL_DEVICE_ERROR);
1826                 } else
1827                         done = false;
1828         }
1829
1830         if (done) {
1831                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1832                     req->cid, 0, status);
1833                 pci_nvme_release_ioreq(sc, req);
1834         }
1835 }
1836
1837 static bool
1838 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1839     struct nvme_command *cmd,
1840     struct pci_nvme_blockstore *nvstore,
1841     struct pci_nvme_ioreq *req,
1842     uint16_t *status)
1843 {
1844         int err;
1845         bool pending = false;
1846
1847         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1848                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1849                 goto out;
1850         }
1851
1852         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1853                 struct nvme_dsm_range *range;
1854                 uint32_t nr, r;
1855                 int sectsz = sc->nvstore.sectsz;
1856
1857                 /*
1858                  * DSM calls are advisory only, and compliant controllers
1859                  * may choose to take no actions (i.e. return Success).
1860                  */
1861                 if (!nvstore->deallocate) {
1862                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1863                         goto out;
1864                 }
1865
1866                 if (req == NULL) {
1867                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1868                         goto out;
1869                 }
1870
1871                 /* copy locally because a range entry could straddle PRPs */
1872                 range = calloc(1, NVME_MAX_DSM_TRIM);
1873                 if (range == NULL) {
1874                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1875                         goto out;
1876                 }
1877                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1878                     (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1879
1880                 /*
1881                  * If the request is for more than a single range, store
1882                  * the ranges in the br_iov. Optimize for the common case
1883                  * of a single range.
1884                  *
1885                  * Note that NVMe Number of Ranges is a zero based value
1886                  */
1887                 nr = cmd->cdw10 & 0xff;
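                     /*
                      * e.g. nr == 0 means a single range and nr == 255 means
                      * 256 ranges, the most a DSM command can carry.
                      */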
1888
1889                 req->io_req.br_iovcnt = 0;
1890                 req->io_req.br_offset = range[0].starting_lba * sectsz;
1891                 req->io_req.br_resid = range[0].length * sectsz;
1892
1893                 if (nr == 0) {
1894                         req->io_req.br_callback = pci_nvme_io_done;
1895                 } else {
1896                         struct iovec *iov = req->io_req.br_iov;
1897
1898                         for (r = 0; r <= nr; r++) {
1899                                 iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1900                                 iov[r].iov_len = range[r].length * sectsz;
1901                         }
1902                         req->io_req.br_callback = pci_nvme_dealloc_sm;
1903
1904                         /*
1905                          * Use prev_gpaddr to track the current entry and
1906                          * prev_size to track the number of entries
1907                          */
1908                         req->prev_gpaddr = 0;
1909                         req->prev_size = r;
1910                 }
1911
1912                 err = blockif_delete(nvstore->ctx, &req->io_req);
1913                 if (err)
1914                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1915                 else
1916                         pending = true;
1917
1918                 free(range);
1919         }
1920 out:
1921         return (pending);
1922 }
1923
1924 static void
1925 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1926 {
1927         struct nvme_submission_queue *sq;
1928         uint16_t status;
1929         uint16_t sqhead;
1930
1931         /* handle all submissions up to sq->tail index */
1932         sq = &sc->submit_queues[idx];
1933
1934         pthread_mutex_lock(&sq->mtx);
1935
1936         sqhead = sq->head;
1937         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1938                  idx, sqhead, sq->tail, sq->qbase);
1939
1940         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1941                 struct nvme_command *cmd;
1942                 struct pci_nvme_ioreq *req;
1943                 uint32_t nsid;
1944                 bool pending;
1945
1946                 pending = false;
1947                 req = NULL;
1948                 status = 0;
1949
1950                 cmd = &sq->qbase[sqhead];
1951                 sqhead = (sqhead + 1) % sq->size;
1952
1953                 nsid = le32toh(cmd->nsid);
1954                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
1955                         pci_nvme_status_genc(&status,
1956                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1957                         status |=
1958                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
1959                         goto complete;
1960                 }
1961
1962                 req = pci_nvme_get_ioreq(sc);
1963                 if (req == NULL) {
1964                         pci_nvme_status_genc(&status,
1965                             NVME_SC_INTERNAL_DEVICE_ERROR);
1966                         WPRINTF("%s: unable to allocate IO req", __func__);
1967                         goto complete;
1968                 }
1969                 req->nvme_sq = sq;
1970                 req->sqid = idx;
1971                 req->opc = cmd->opc;
1972                 req->cid = cmd->cid;
1973                 req->nsid = cmd->nsid;
1974
1975                 switch (cmd->opc) {
1976                 case NVME_OPC_FLUSH:
1977                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
1978                             req, &status);
1979                         break;
1980                 case NVME_OPC_WRITE:
1981                 case NVME_OPC_READ:
1982                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
1983                             req, &status);
1984                         break;
1985                 case NVME_OPC_WRITE_ZEROES:
1986                         /* TODO: write zeroes
1987                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
1988                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
1989                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1990                         break;
1991                 case NVME_OPC_DATASET_MANAGEMENT:
1992                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
1993                             req, &status);
1994                         break;
1995                 default:
1996                         WPRINTF("%s unhandled io command 0x%x",
1997                             __func__, cmd->opc);
1998                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
1999                 }
2000 complete:
2001                 if (!pending) {
2002                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2003                             status);
2004                         if (req != NULL)
2005                                 pci_nvme_release_ioreq(sc, req);
2006                 }
2007         }
2008
2009         sq->head = sqhead;
2010
2011         pthread_mutex_unlock(&sq->mtx);
2012 }
2013
2014 static void
2015 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2016         uint64_t idx, int is_sq, uint64_t value)
2017 {
2018         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2019                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2020
2021         if (is_sq) {
2022                 if (idx > sc->num_squeues) {
2023                         WPRINTF("%s queue index %lu overflow from "
2024                                  "guest (max %u)",
2025                                  __func__, idx, sc->num_squeues);
2026                         return;
2027                 }
2028
2029                 atomic_store_short(&sc->submit_queues[idx].tail,
2030                                    (uint16_t)value);
2031
2032                 if (idx == 0) {
2033                         pci_nvme_handle_admin_cmd(sc, value);
2034                 } else {
2035                         /* submission queue; handle new entries in SQ */
2036                         if (idx > sc->num_squeues) {
2037                                 WPRINTF("%s SQ index %lu overflow from "
2038                                          "guest (max %u)",
2039                                          __func__, idx, sc->num_squeues);
2040                                 return;
2041                         }
2042                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2043                 }
2044         } else {
2045                 if (idx > sc->num_cqueues) {
2046                         WPRINTF("%s queue index %lu overflow from "
2047                                  "guest (max %u)",
2048                                  __func__, idx, sc->num_cqueues);
2049                         return;
2050                 }
2051
2052                 atomic_store_short(&sc->compl_queues[idx].head,
2053                                 (uint16_t)value);
2054         }
2055 }
2056
2057 static void
2058 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2059 {
2060         const char *s = iswrite ? "WRITE" : "READ";
2061
2062         switch (offset) {
2063         case NVME_CR_CAP_LOW:
2064                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2065                 break;
2066         case NVME_CR_CAP_HI:
2067                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2068                 break;
2069         case NVME_CR_VS:
2070                 DPRINTF("%s %s NVME_CR_VS", func, s);
2071                 break;
2072         case NVME_CR_INTMS:
2073                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2074                 break;
2075         case NVME_CR_INTMC:
2076                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2077                 break;
2078         case NVME_CR_CC:
2079                 DPRINTF("%s %s NVME_CR_CC", func, s);
2080                 break;
2081         case NVME_CR_CSTS:
2082                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2083                 break;
2084         case NVME_CR_NSSR:
2085                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2086                 break;
2087         case NVME_CR_AQA:
2088                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2089                 break;
2090         case NVME_CR_ASQ_LOW:
2091                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2092                 break;
2093         case NVME_CR_ASQ_HI:
2094                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2095                 break;
2096         case NVME_CR_ACQ_LOW:
2097                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2098                 break;
2099         case NVME_CR_ACQ_HI:
2100                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2101                 break;
2102         default:
2103                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2104         }
2105
2106 }
2107
2108 static void
2109 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2110         uint64_t offset, int size, uint64_t value)
2111 {
2112         uint32_t ccreg;
2113
2114         if (offset >= NVME_DOORBELL_OFFSET) {
2115                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2116                 uint64_t idx = belloffset / 8; /* doorbell pair: 4-byte SQ tail, 4-byte CQ head */
2117                 int is_sq = (belloffset % 8) < 4;
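                     /*
                      * With a doorbell stride of 0 (4-byte registers), each
                      * queue pair occupies 8 bytes: SQ y tail at 8*y and
                      * CQ y head at 8*y + 4.  E.g. belloffset 0x0 is the
                      * admin SQ tail, 0x4 the admin CQ head, 0x8 the SQ1 tail.
                      */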
2118
2119                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2120                         WPRINTF("guest attempted an overflow write offset "
2121                                  "0x%lx, val 0x%lx in %s",
2122                                  offset, value, __func__);
2123                         return;
2124                 }
2125
2126                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2127                 return;
2128         }
2129
2130         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2131                 offset, size, value);
2132
2133         if (size != 4) {
2134                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2135                          "val 0x%lx) to bar0 in %s",
2136                          size, offset, value, __func__);
2137                 /* TODO: shutdown device */
2138                 return;
2139         }
2140
2141         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2142
2143         pthread_mutex_lock(&sc->mtx);
2144
2145         switch (offset) {
2146         case NVME_CR_CAP_LOW:
2147         case NVME_CR_CAP_HI:
2148                 /* readonly */
2149                 break;
2150         case NVME_CR_VS:
2151                 /* readonly */
2152                 break;
2153         case NVME_CR_INTMS:
2154                 /* MSI-X, so ignore */
2155                 break;
2156         case NVME_CR_INTMC:
2157                 /* MSI-X, so ignore */
2158                 break;
2159         case NVME_CR_CC:
2160                 ccreg = (uint32_t)value;
2161
2162                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2163                          "iocqes %u",
2164                         __func__,
2165                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2166                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2167                          NVME_CC_GET_IOCQES(ccreg));
2168
2169                 if (NVME_CC_GET_SHN(ccreg)) {
2170                         /* perform shutdown - flush out data to backend */
2171                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2172                             NVME_CSTS_REG_SHST_SHIFT);
2173                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2174                             NVME_CSTS_REG_SHST_SHIFT;
2175                 }
2176                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2177                         if (NVME_CC_GET_EN(ccreg) == 0)
2178                                 /* transition 1->0 causes controller reset */
2179                                 pci_nvme_reset_locked(sc);
2180                         else
2181                                 pci_nvme_init_controller(ctx, sc);
2182                 }
2183
2184                 /* Insert the iocqes, iosqes and en bits from the write */
2185                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2186                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2187                 if (NVME_CC_GET_EN(ccreg) == 0) {
2188                         /* Insert the ams, mps and css bit fields */
2189                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2190                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2191                         sc->regs.csts &= ~NVME_CSTS_RDY;
2192                 } else if (sc->pending_ios == 0) {
2193                         sc->regs.csts |= NVME_CSTS_RDY;
2194                 }
2195                 break;
2196         case NVME_CR_CSTS:
2197                 break;
2198         case NVME_CR_NSSR:
2199                 /* ignore writes; don't support subsystem reset */
2200                 break;
2201         case NVME_CR_AQA:
2202                 sc->regs.aqa = (uint32_t)value;
2203                 break;
2204         case NVME_CR_ASQ_LOW:
2205                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2206                                (0xFFFFF000 & value);
2207                 break;
2208         case NVME_CR_ASQ_HI:
2209                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2210                                (value << 32);
2211                 break;
2212         case NVME_CR_ACQ_LOW:
2213                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2214                                (0xFFFFF000 & value);
2215                 break;
2216         case NVME_CR_ACQ_HI:
2217                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2218                                (value << 32);
2219                 break;
2220         default:
2221                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2222                          __func__, offset, value, size);
2223         }
2224         pthread_mutex_unlock(&sc->mtx);
2225 }
2226
2227 static void
2228 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2229                 int baridx, uint64_t offset, int size, uint64_t value)
2230 {
2231         struct pci_nvme_softc* sc = pi->pi_arg;
2232
2233         if (baridx == pci_msix_table_bar(pi) ||
2234             baridx == pci_msix_pba_bar(pi)) {
2235                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2236                          " value 0x%lx", baridx, offset, size, value);
2237
2238                 pci_emul_msix_twrite(pi, offset, size, value);
2239                 return;
2240         }
2241
2242         switch (baridx) {
2243         case 0:
2244                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2245                 break;
2246
2247         default:
2248                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2249                          __func__, baridx, value);
2250         }
2251 }
2252
2253 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2254         uint64_t offset, int size)
2255 {
2256         uint64_t value;
2257
2258         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2259
2260         if (offset < NVME_DOORBELL_OFFSET) {
2261                 void *p = &(sc->regs);
2262                 pthread_mutex_lock(&sc->mtx);
2263                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2264                 pthread_mutex_unlock(&sc->mtx);
2265         } else {
2266                 value = 0;
2267                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2268         }
2269
2270         switch (size) {
2271         case 1:
2272                 value &= 0xFF;
2273                 break;
2274         case 2:
2275                 value &= 0xFFFF;
2276                 break;
2277         case 4:
2278                 value &= 0xFFFFFFFF;
2279                 break;
2280         }
2281
2282         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2283                  offset, size, (uint32_t)value);
2284
2285         return (value);
2286 }
2287
2288
2289
2290 static uint64_t
2291 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2292     uint64_t offset, int size)
2293 {
2294         struct pci_nvme_softc* sc = pi->pi_arg;
2295
2296         if (baridx == pci_msix_table_bar(pi) ||
2297             baridx == pci_msix_pba_bar(pi)) {
2298                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2299                         baridx, offset, size);
2300
2301                 return pci_emul_msix_tread(pi, offset, size);
2302         }
2303
2304         switch (baridx) {
2305         case 0:
2306                 return pci_nvme_read_bar_0(sc, offset, size);
2307
2308         default:
2309                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2310         }
2311
2312         return (0);
2313 }
2314
2315
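     /*
      * Parse the comma-separated option string handed to the device.
      * Illustrative examples only:
      *
      *   -s 4,nvme,/dev/zvol/tank/nvme0,ser=NVME0001,maxq=4,ioslots=16
      *   -s 4,nvme,ram=1024,sectsz=512
      *
      * If the first option is not one of the recognized keys, it is opened
      * as the backing device path.
      */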
2316 static int
2317 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2318 {
2319         char bident[sizeof("XX:X:X")];
2320         char    *uopt, *xopts, *config;
2321         uint32_t sectsz;
2322         int optidx;
2323
2324         sc->max_queues = NVME_QUEUES;
2325         sc->max_qentries = NVME_MAX_QENTRIES;
2326         sc->ioslots = NVME_IOSLOTS;
2327         sc->num_squeues = sc->max_queues;
2328         sc->num_cqueues = sc->max_queues;
2329         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2330         sectsz = 0;
2331
2332         uopt = strdup(opts);
2333         optidx = 0;
2334         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2335                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2336         for (xopts = strtok(uopt, ",");
2337              xopts != NULL;
2338              xopts = strtok(NULL, ",")) {
2339
2340                 if ((config = strchr(xopts, '=')) != NULL)
2341                         *config++ = '\0';
2342
2343                 if (!strcmp("maxq", xopts)) {
2344                         sc->max_queues = atoi(config);
2345                 } else if (!strcmp("qsz", xopts)) {
2346                         sc->max_qentries = atoi(config);
2347                 } else if (!strcmp("ioslots", xopts)) {
2348                         sc->ioslots = atoi(config);
2349                 } else if (!strcmp("sectsz", xopts)) {
2350                         sectsz = atoi(config);
2351                 } else if (!strcmp("ser", xopts)) {
2352                         /*
2353                          * This field indicates the Product Serial Number in
2354                          * 7-bit ASCII, unused bytes should be space characters.
2355                          * Ref: NVMe v1.3c.
2356                          */
2357                         cpywithpad((char *)sc->ctrldata.sn,
2358                                    sizeof(sc->ctrldata.sn), config, ' ');
2359                 } else if (!strcmp("ram", xopts)) {
2360                         uint64_t sz = strtoull(config, NULL, 10);
2361
2362                         sc->nvstore.type = NVME_STOR_RAM;
2363                         sc->nvstore.size = sz * 1024 * 1024;
2364                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2365                         sc->nvstore.sectsz = 4096;
2366                         sc->nvstore.sectsz_bits = 12;
2367                         if (sc->nvstore.ctx == NULL) {
2368                                 perror("Unable to allocate RAM");
2369                                 free(uopt);
2370                                 return (-1);
2371                         }
2372                 } else if (!strcmp("eui64", xopts)) {
2373                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2374                 } else if (!strcmp("dsm", xopts)) {
2375                         if (!strcmp("auto", config))
2376                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2377                         else if (!strcmp("enable", config))
2378                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2379                         else if (!strcmp("disable", config))
2380                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2381                 } else if (optidx == 0) {
2382                         snprintf(bident, sizeof(bident), "%d:%d",
2383                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2384                         sc->nvstore.ctx = blockif_open(xopts, bident);
2385                         if (sc->nvstore.ctx == NULL) {
2386                                 perror("Could not open backing file");
2387                                 free(uopt);
2388                                 return (-1);
2389                         }
2390                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2391                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2392                 } else {
2393                         EPRINTLN("Invalid option %s", xopts);
2394                         free(uopt);
2395                         return (-1);
2396                 }
2397
2398                 optidx++;
2399         }
2400         free(uopt);
2401
2402         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2403                 EPRINTLN("backing store not specified");
2404                 return (-1);
2405         }
2406         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2407                 sc->nvstore.sectsz = sectsz;
2408         else if (sc->nvstore.type != NVME_STOR_RAM)
2409                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2410         for (sc->nvstore.sectsz_bits = 9;
2411              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2412              sc->nvstore.sectsz_bits++);
2413
2414         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2415                 sc->max_queues = NVME_QUEUES;
2416
2417         if (sc->max_qentries <= 0) {
2418                 EPRINTLN("Invalid qsz option");
2419                 return (-1);
2420         }
2421         if (sc->ioslots <= 0) {
2422                 EPRINTLN("Invalid ioslots option");
2423                 return (-1);
2424         }
2425
2426         return (0);
2427 }
2428
2429 static int
2430 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2431 {
2432         struct pci_nvme_softc *sc;
2433         uint32_t pci_membar_sz;
2434         int     error;
2435
2436         error = 0;
2437
2438         sc = calloc(1, sizeof(struct pci_nvme_softc));
2439         pi->pi_arg = sc;
2440         sc->nsc_pi = pi;
2441
2442         error = pci_nvme_parse_opts(sc, opts);
2443         if (error < 0)
2444                 goto done;
2445         else
2446                 error = 0;
2447
2448         STAILQ_INIT(&sc->ioreqs_free);
2449         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2450         for (int i = 0; i < sc->ioslots; i++) {
2451                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2452         }
2453
2454         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2455         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2456         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2457         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2458         pci_set_cfgdata8(pi, PCIR_PROGIF,
2459                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2460
2461         /*
2462          * Allocate size of NVMe registers + doorbell space for all queues.
2463          *
2464          * The specification requires a minimum memory I/O window size of 16K.
2465          * The Windows driver will refuse to start a device with a smaller
2466          * window.
2467          */
2468         pci_membar_sz = sizeof(struct nvme_registers) +
2469             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2470         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
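             /*
              * E.g. with 16 I/O queues the doorbell area adds 2 * 4 * 17 =
              * 136 bytes, so the MAX() above normally rounds the BAR up to
              * the 16K NVME_MMIO_SPACE_MIN.
              */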
2471
2472         DPRINTF("nvme membar size: %u", pci_membar_sz);
2473
2474         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2475         if (error) {
2476                 WPRINTF("%s pci alloc mem bar failed", __func__);
2477                 goto done;
2478         }
2479
2480         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2481         if (error) {
2482                 WPRINTF("%s pci add msixcap failed", __func__);
2483                 goto done;
2484         }
2485
2486         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2487         if (error) {
2488                 WPRINTF("%s pci add Express capability failed", __func__);
2489                 goto done;
2490         }
2491
2492         pthread_mutex_init(&sc->mtx, NULL);
2493         sem_init(&sc->iosemlock, 0, sc->ioslots);
2494
2495         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2496         /*
2497          * Controller data depends on Namespace data so initialize Namespace
2498          * data first.
2499          */
2500         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2501         pci_nvme_init_ctrldata(sc);
2502         pci_nvme_init_logpages(sc);
2503         pci_nvme_init_features(sc);
2504
2505         pci_nvme_reset(sc);
2506
2507         pci_lintr_request(pi);
2508
2509 done:
2510         return (error);
2511 }
2512
2513
2514 struct pci_devemu pci_de_nvme = {
2515         .pe_emu =       "nvme",
2516         .pe_init =      pci_nvme_init,
2517         .pe_barwrite =  pci_nvme_write,
2518         .pe_barread =   pci_nvme_read
2519 };
2520 PCI_EMUL_SET(pci_de_nvme);