1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
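
/*
 * Example invocations (the slot number, backing paths, and parameter values
 * below are illustrative only):
 *
 *  -s 4,nvme,/dev/zvol/pool/vm0-disk0,maxq=4,qsz=512,ioslots=16,sectsz=512,ser=BHYVE1234
 *  -s 4,nvme,ram=1024
 */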
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80
81 #include <dev/nvme/nvme.h>
82
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88
89
90 static int nvme_debug = 0;
91 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
92 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93
94 /* defaults; can be overridden */
95 #define NVME_MSIX_BAR           4
96
97 #define NVME_IOSLOTS            8
98
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN     (1 << 14)
101
102 #define NVME_QUEUES             16
103 #define NVME_MAX_QENTRIES       2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define NVME_MPSMIN             0
106 /* MPSMIN converted to bytes */
107 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
108
109 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
110 #define NVME_MDTS               9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
113 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
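
/*
 * With the values above (NVME_MDTS = 9 and NVME_MPSMIN = 0, i.e. 4 KiB pages),
 * a single command may transfer up to 512 * 4 KiB = 2 MiB of data, described
 * by at most NVME_MAX_IOVEC = 513 page-sized segments.
 */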
114
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS          0xffff
117 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
118
119 /* helpers */
120
121 /* Convert a zero-based value into a one-based value */
122 #define ONE_BASED(zero)         ((zero) + 1)
123 /* Convert a one-based value into a zero-based value */
124 #define ZERO_BASED(one)         ((one)  - 1)
125
126 /* Encode number of SQ's and CQ's for Set/Get Features */
127 #define NVME_FEATURE_NUM_QUEUES(sc) \
128         (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
129         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
130
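/*
 * For example, with num_squeues = 4 and num_cqueues = 4 (hypothetical values),
 * NVME_FEATURE_NUM_QUEUES() evaluates to 0x00030003: the zero-based SQ and CQ
 * counts reported in Completion Dword 0 for the Number of Queues feature.
 */
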
131 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
132
133 enum nvme_controller_register_offsets {
134         NVME_CR_CAP_LOW = 0x00,
135         NVME_CR_CAP_HI  = 0x04,
136         NVME_CR_VS      = 0x08,
137         NVME_CR_INTMS   = 0x0c,
138         NVME_CR_INTMC   = 0x10,
139         NVME_CR_CC      = 0x14,
140         NVME_CR_CSTS    = 0x1c,
141         NVME_CR_NSSR    = 0x20,
142         NVME_CR_AQA     = 0x24,
143         NVME_CR_ASQ_LOW = 0x28,
144         NVME_CR_ASQ_HI  = 0x2c,
145         NVME_CR_ACQ_LOW = 0x30,
146         NVME_CR_ACQ_HI  = 0x34,
147 };
148
149 enum nvme_cmd_cdw11 {
150         NVME_CMD_CDW11_PC  = 0x0001,
151         NVME_CMD_CDW11_IEN = 0x0002,
152         NVME_CMD_CDW11_IV  = 0xFFFF0000,
153 };
154
155 enum nvme_copy_dir {
156         NVME_COPY_TO_PRP,
157         NVME_COPY_FROM_PRP,
158 };
159
160 #define NVME_CQ_INTEN   0x01
161 #define NVME_CQ_INTCOAL 0x02
162
163 struct nvme_completion_queue {
164         struct nvme_completion *qbase;
165         pthread_mutex_t mtx;
166         uint32_t        size;
167         uint16_t        tail; /* nvme progress */
168         uint16_t        head; /* guest progress */
169         uint16_t        intr_vec;
170         uint32_t        intr_en;
171 };
172
173 struct nvme_submission_queue {
174         struct nvme_command *qbase;
175         pthread_mutex_t mtx;
176         uint32_t        size;
177         uint16_t        head; /* nvme progress */
178         uint16_t        tail; /* guest progress */
179         uint16_t        cqid; /* completion queue id */
180         int             qpriority;
181 };
182
183 enum nvme_storage_type {
184         NVME_STOR_BLOCKIF = 0,
185         NVME_STOR_RAM = 1,
186 };
187
188 struct pci_nvme_blockstore {
189         enum nvme_storage_type type;
190         void            *ctx;
191         uint64_t        size;
192         uint32_t        sectsz;
193         uint32_t        sectsz_bits;
194         uint64_t        eui64;
195         uint32_t        deallocate:1;
196 };
197
198 /*
199  * Calculate the number of additional page descriptors for guest IO requests
200  * based on the advertised Max Data Transfer (MDTS) and given the number of
201  * default iovec's in a struct blockif_req.
202  */
203 #define MDTS_PAD_SIZE \
204         ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205           NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206           0 )
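
/*
 * For example, assuming BLOCKIF_IOV_MAX is 128 (its value in block_if.h at
 * the time of writing), MDTS_PAD_SIZE works out to 513 - 128 = 385 extra
 * iovec entries reserved in struct pci_nvme_ioreq below.
 */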
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 typedef enum {
255         PCI_NVME_AE_TYPE_ERROR = 0,
256         PCI_NVME_AE_TYPE_SMART,
257         PCI_NVME_AE_TYPE_NOTICE,
258         PCI_NVME_AE_TYPE_IO_CMD = 6,
259         PCI_NVME_AE_TYPE_VENDOR = 7,
260         PCI_NVME_AE_TYPE_MAX            /* Must be last */
261 } pci_nvme_async_type;
262
263 /* Asynchronous Event Requests */
264 struct pci_nvme_aer {
265         STAILQ_ENTRY(pci_nvme_aer) link;
266         uint16_t        cid;    /* Command ID of the submitted AER */
267 };
268
269 typedef enum {
270         PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
271         PCI_NVME_AE_INFO_FW_ACTIVATION,
272         PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
273         PCI_NVME_AE_INFO_ANA_CHANGE,
274         PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
275         PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
276         PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
277         PCI_NVME_AE_INFO_MAX,
278 } pci_nvme_async_info;
279
280 /* Asynchronous Event Notifications */
281 struct pci_nvme_aen {
282         pci_nvme_async_type atype;
283         uint32_t        event_data;
284         bool            posted;
285 };
286
287 struct pci_nvme_softc {
288         struct pci_devinst *nsc_pi;
289
290         pthread_mutex_t mtx;
291
292         struct nvme_registers regs;
293
294         struct nvme_namespace_data  nsdata;
295         struct nvme_controller_data ctrldata;
296         struct nvme_error_information_entry err_log;
297         struct nvme_health_information_page health_log;
298         struct nvme_firmware_page fw_log;
299
300         struct pci_nvme_blockstore nvstore;
301
302         uint16_t        max_qentries;   /* max entries per queue */
303         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
304         uint32_t        num_cqueues;
305         uint32_t        num_squeues;
306         bool            num_q_is_set; /* Has host set Number of Queues */
307
308         struct pci_nvme_ioreq *ioreqs;
309         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
310         uint32_t        pending_ios;
311         uint32_t        ioslots;
312         sem_t           iosemlock;
313
314         /*
315          * Memory mapped Submission and Completion queues
316          * Each array includes both Admin and IO queues
317          */
318         struct nvme_completion_queue *compl_queues;
319         struct nvme_submission_queue *submit_queues;
320
321         struct nvme_feature_obj feat[NVME_FID_MAX];
322
323         enum nvme_dsm_type dataset_management;
324
325         /* Accounting for SMART data */
326         __uint128_t     read_data_units;
327         __uint128_t     write_data_units;
328         __uint128_t     read_commands;
329         __uint128_t     write_commands;
330         uint32_t        read_dunits_remainder;
331         uint32_t        write_dunits_remainder;
332
333         STAILQ_HEAD(, pci_nvme_aer) aer_list;
334         pthread_mutex_t aer_mtx;
335         uint32_t        aer_count;
336         struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
337         pthread_t       aen_tid;
338         pthread_mutex_t aen_mtx;
339         pthread_cond_t  aen_cond;
340 };
341
342
343 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
344     struct nvme_completion_queue *cq,
345     uint32_t cdw0,
346     uint16_t cid,
347     uint16_t sqid,
348     uint16_t status);
349 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
350 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
351 static void pci_nvme_io_done(struct blockif_req *, int);
352
353 /* Controller Configuration utils */
354 #define NVME_CC_GET_EN(cc) \
355         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
356 #define NVME_CC_GET_CSS(cc) \
357         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
358 #define NVME_CC_GET_SHN(cc) \
359         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
360 #define NVME_CC_GET_IOSQES(cc) \
361         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
362 #define NVME_CC_GET_IOCQES(cc) \
363         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
364
365 #define NVME_CC_WRITE_MASK \
366         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
367          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
368          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
369
370 #define NVME_CC_NEN_WRITE_MASK \
371         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
372          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
373          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
374
375 /* Controller Status utils */
376 #define NVME_CSTS_GET_RDY(sts) \
377         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
378
379 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
380
381 /* Completion Queue status word utils */
382 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
383 #define NVME_STATUS_MASK \
384         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
385          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
386
387 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
388         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
389
390 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
391     struct nvme_feature_obj *,
392     struct nvme_command *,
393     struct nvme_completion *);
394 static void nvme_feature_num_queues(struct pci_nvme_softc *,
395     struct nvme_feature_obj *,
396     struct nvme_command *,
397     struct nvme_completion *);
398 static void nvme_feature_iv_config(struct pci_nvme_softc *,
399     struct nvme_feature_obj *,
400     struct nvme_command *,
401     struct nvme_completion *);
402
403 static void *aen_thr(void *arg);
404
405 static __inline void
406 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
407 {
408         size_t len;
409
410         len = strnlen(src, dst_size);
411         memset(dst, pad, dst_size);
412         memcpy(dst, src, len);
413 }
414
415 static __inline void
416 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
417 {
418
419         *status &= ~NVME_STATUS_MASK;
420         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
421                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
422 }
423
424 static __inline void
425 pci_nvme_status_genc(uint16_t *status, uint16_t code)
426 {
427
428         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
429 }
430
431 /*
432  * Initialize the requested number of IO Submission and Completion Queues.
433  * Admin queues are allocated implicitly.
434  */
435 static void
436 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
437 {
438         uint32_t i;
439
440         /*
441          * Allocate and initialize the Submission Queues
442          */
443         if (nsq > NVME_QUEUES) {
444                 WPRINTF("%s: clamping number of SQ from %u to %u",
445                                         __func__, nsq, NVME_QUEUES);
446                 nsq = NVME_QUEUES;
447         }
448
449         sc->num_squeues = nsq;
450
451         sc->submit_queues = calloc(sc->num_squeues + 1,
452                                 sizeof(struct nvme_submission_queue));
453         if (sc->submit_queues == NULL) {
454                 WPRINTF("%s: SQ allocation failed", __func__);
455                 sc->num_squeues = 0;
456         } else {
457                 struct nvme_submission_queue *sq = sc->submit_queues;
458
459                 for (i = 0; i < sc->num_squeues; i++)
460                         pthread_mutex_init(&sq[i].mtx, NULL);
461         }
462
463         /*
464          * Allocate and initialize the Completion Queues
465          */
466         if (ncq > NVME_QUEUES) {
467                 WPRINTF("%s: clamping number of CQ from %u to %u",
468                                         __func__, ncq, NVME_QUEUES);
469                 ncq = NVME_QUEUES;
470         }
471
472         sc->num_cqueues = ncq;
473
474         sc->compl_queues = calloc(sc->num_cqueues + 1,
475                                 sizeof(struct nvme_completion_queue));
476         if (sc->compl_queues == NULL) {
477                 WPRINTF("%s: CQ allocation failed", __func__);
478                 sc->num_cqueues = 0;
479         } else {
480                 struct nvme_completion_queue *cq = sc->compl_queues;
481
482                 for (i = 0; i < sc->num_cqueues; i++)
483                         pthread_mutex_init(&cq[i].mtx, NULL);
484         }
485 }
486
487 static void
488 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
489 {
490         struct nvme_controller_data *cd = &sc->ctrldata;
491
492         cd->vid = 0xFB5D;
493         cd->ssvid = 0x0000;
494
495         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
496         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
497
498         /* Num of submission commands that we can handle at a time (2^rab) */
499         cd->rab   = 4;
500
501         /* FreeBSD OUI */
502         cd->ieee[0] = 0x58;
503         cd->ieee[1] = 0x9c;
504         cd->ieee[2] = 0xfc;
505
506         cd->mic = 0;
507
508         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
509
510         cd->ver = 0x00010300;
511
512         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
513         cd->acl = 2;
514         cd->aerl = 4;
515
516         /* Advertise 1, Read-only firmware slot */
517         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
518             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
519         cd->lpa = 0;    /* TODO: support some simple things like SMART */
520         cd->elpe = 0;   /* max error log page entries */
521         cd->npss = 1;   /* number of power states support */
522
523         /* Warning Composite Temperature Threshold */
524         cd->wctemp = 0x0157;
525
526         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
527             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
528         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
529             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
530         cd->nn = 1;     /* number of namespaces */
531
532         cd->oncs = 0;
533         switch (sc->dataset_management) {
534         case NVME_DATASET_MANAGEMENT_AUTO:
535                 if (sc->nvstore.deallocate)
536                         cd->oncs |= NVME_ONCS_DSM;
537                 break;
538         case NVME_DATASET_MANAGEMENT_ENABLE:
539                 cd->oncs |= NVME_ONCS_DSM;
540                 break;
541         default:
542                 break;
543         }
544
545         cd->fna = 0x03;
546
547         cd->power_state[0].mp = 10;
548 }
549
550 /*
551  * Calculate the CRC-16 of the given buffer
552  * See copyright attribution at top of file
553  */
554 static uint16_t
555 crc16(uint16_t crc, const void *buffer, unsigned int len)
556 {
557         const unsigned char *cp = buffer;
558         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
559         static uint16_t const crc16_table[256] = {
560                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
561                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
562                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
563                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
564                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
565                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
566                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
567                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
568                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
569                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
570                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
571                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
572                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
573                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
574                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
575                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
576                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
577                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
578                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
579                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
580                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
581                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
582                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
583                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
584                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
585                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
586                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
587                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
588                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
589                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
590                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
591                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
592         };
593
594         while (len--)
595                 crc = (((crc >> 8) & 0xffU) ^
596                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
597         return crc;
598 }
599
600 static void
601 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
602     struct nvme_namespace_data *nd, uint32_t nsid,
603     struct pci_nvme_blockstore *nvstore)
604 {
605
606         /* Get capacity and block size information from backing store */
607         nd->nsze = nvstore->size / nvstore->sectsz;
608         nd->ncap = nd->nsze;
609         nd->nuse = nd->nsze;
610
611         if (nvstore->type == NVME_STOR_BLOCKIF)
612                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
613
614         nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
615         nd->flbas = 0;
616
617         /* Create an EUI-64 if user did not provide one */
618         if (nvstore->eui64 == 0) {
619                 char *data = NULL;
620                 uint64_t eui64 = nvstore->eui64;
621
622                 asprintf(&data, "%s%u%u%u", get_config_value("name"),
623                     sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
624                     sc->nsc_pi->pi_func);
625
626                 if (data != NULL) {
627                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
628                         free(data);
629                 }
630                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
631         }
632         be64enc(nd->eui64, nvstore->eui64);
633
634         /* LBA data-sz = 2^lbads */
635         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
636 }
637
638 static void
639 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
640 {
641
642         memset(&sc->err_log, 0, sizeof(sc->err_log));
643         memset(&sc->health_log, 0, sizeof(sc->health_log));
644         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
645
646         /* Set read/write remainder to round up according to spec */
647         sc->read_dunits_remainder = 999;
648         sc->write_dunits_remainder = 999;
649
650         /* Set nominal Health values checked by implementations */
651         sc->health_log.temperature = 310;
652         sc->health_log.available_spare = 100;
653         sc->health_log.available_spare_threshold = 10;
654 }
655
656 static void
657 pci_nvme_init_features(struct pci_nvme_softc *sc)
658 {
659
660         sc->feat[0].set = nvme_feature_invalid_cb;
661         sc->feat[0].get = nvme_feature_invalid_cb;
662
663         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
664         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
665         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
666         sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
667             nvme_feature_iv_config;
668         /* Enable all AENs by default */
669         sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f;
670         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
671             nvme_feature_invalid_cb;
672         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
673             nvme_feature_invalid_cb;
674 }
675
676 static void
677 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
678 {
679
680         STAILQ_INIT(&sc->aer_list);
681         sc->aer_count = 0;
682 }
683
684 static void
685 pci_nvme_aer_init(struct pci_nvme_softc *sc)
686 {
687
688         pthread_mutex_init(&sc->aer_mtx, NULL);
689         pci_nvme_aer_reset(sc);
690 }
691
692 static void
693 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
694 {
695         struct pci_nvme_aer *aer = NULL;
696
697         pthread_mutex_lock(&sc->aer_mtx);
698         while (!STAILQ_EMPTY(&sc->aer_list)) {
699                 aer = STAILQ_FIRST(&sc->aer_list);
700                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
701                 free(aer);
702         }
703         pthread_mutex_unlock(&sc->aer_mtx);
704
705         pci_nvme_aer_reset(sc);
706 }
707
708 static bool
709 pci_nvme_aer_available(struct pci_nvme_softc *sc)
710 {
711
712         return (sc->aer_count != 0);
713 }
714
715 static bool
716 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
717 {
718         struct nvme_controller_data *cd = &sc->ctrldata;
719
720         /* AERL is a zero-based value while aer_count is one-based */
721         return (sc->aer_count == (cd->aerl + 1));
722 }
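
/*
 * For example, with cd->aerl = 4 (set in pci_nvme_init_ctrldata()), the host
 * may keep up to 5 Asynchronous Event Request commands outstanding before
 * this limit check fires.
 */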
723
724 /*
725  * Add an Async Event Request
726  *
727  * Stores an AER to be returned later if the Controller needs to notify the
728  * host of an event.
729  * Note that while the NVMe spec doesn't require Controllers to return AER's
730  * in order, this implementation does preserve the order.
731  */
732 static int
733 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
734 {
735         struct pci_nvme_aer *aer = NULL;
736
737         if (pci_nvme_aer_limit_reached(sc))
738                 return (-1);
739
740         aer = calloc(1, sizeof(struct pci_nvme_aer));
741         if (aer == NULL)
742                 return (-1);
743
744         /* Save the Command ID for use in the completion message */
745         aer->cid = cid;
746
747         pthread_mutex_lock(&sc->aer_mtx);
748         sc->aer_count++;
749         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
750         pthread_mutex_unlock(&sc->aer_mtx);
751
752         return (0);
753 }
754
755 /*
756  * Get an Async Event Request structure
757  *
758  * Returns a pointer to an AER previously submitted by the host or NULL if
759  * no AER's exist. Caller is responsible for freeing the returned struct.
760  */
761 static struct pci_nvme_aer *
762 pci_nvme_aer_get(struct pci_nvme_softc *sc)
763 {
764         struct pci_nvme_aer *aer = NULL;
765
766         pthread_mutex_lock(&sc->aer_mtx);
767         aer = STAILQ_FIRST(&sc->aer_list);
768         if (aer != NULL) {
769                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
770                 sc->aer_count--;
771         }
772         pthread_mutex_unlock(&sc->aer_mtx);
773         
774         return (aer);
775 }
776
777 static void
778 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
779 {
780         uint32_t        atype;
781
782         memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
783
784         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
785                 sc->aen[atype].atype = atype;
786         }
787 }
788
789 static void
790 pci_nvme_aen_init(struct pci_nvme_softc *sc)
791 {
792         char nstr[80];
793
794         pci_nvme_aen_reset(sc);
795
796         pthread_mutex_init(&sc->aen_mtx, NULL);
797         pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
798         snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
799             sc->nsc_pi->pi_func);
800         pthread_set_name_np(sc->aen_tid, nstr);
801 }
802
803 static void
804 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
805 {
806
807         pci_nvme_aen_reset(sc);
808 }
809
810 /* Notify the AEN thread of pending work */
811 static void
812 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
813 {
814
815         pthread_cond_signal(&sc->aen_cond);
816 }
817
818 /*
819  * Post an Asynchronous Event Notification
820  */
821 static int32_t
822 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
823                 uint32_t event_data)
824 {
825         struct pci_nvme_aen *aen;
826
827         if (atype >= PCI_NVME_AE_TYPE_MAX) {
828                 return(EINVAL);
829         }
830
831         pthread_mutex_lock(&sc->aen_mtx);
832         aen = &sc->aen[atype];
833
834         /* Has the controller already posted an event of this type? */
835         if (aen->posted) {
836                 pthread_mutex_unlock(&sc->aen_mtx);
837                 return(EALREADY);
838         }
839
840         aen->event_data = event_data;
841         aen->posted = true;
842         pthread_mutex_unlock(&sc->aen_mtx);
843
844         pci_nvme_aen_notify(sc);
845
846         return(0);
847 }
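
/*
 * Illustrative usage (the caller and argument shown here are hypothetical):
 * a SMART/health event such as a temperature excursion could be reported with
 *
 *      pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
 *          sc->health_log.critical_warning);
 *
 * The call only latches the event; aen_thr() later pairs it with a pending
 * AER and posts the completion to the Admin Completion Queue.
 */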
848
849 static void
850 pci_nvme_aen_process(struct pci_nvme_softc *sc)
851 {
852         struct pci_nvme_aer *aer;
853         struct pci_nvme_aen *aen;
854         pci_nvme_async_type atype;
855         uint32_t mask;
856         uint16_t status;
857         uint8_t lid;
858
859         assert(pthread_mutex_isowned_np(&sc->aen_mtx));
860         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
861                 aen = &sc->aen[atype];
862                 /* Previous iterations may have depleted the available AER's */
863                 if (!pci_nvme_aer_available(sc)) {
864                         DPRINTF("%s: no AER", __func__);
865                         break;
866                 }
867
868                 if (!aen->posted) {
869                         DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
870                         continue;
871                 }
872
873                 status = NVME_SC_SUCCESS;
874
875                 /* Is the event masked? */
876                 mask =
877                     sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
878
879                 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
880                 switch (atype) {
881                 case PCI_NVME_AE_TYPE_ERROR:
882                         lid = NVME_LOG_ERROR;
883                         break;
884                 case PCI_NVME_AE_TYPE_SMART:
885                         mask &= 0xff;
886                         if ((mask & aen->event_data) == 0)
887                                 continue;
888                         lid = NVME_LOG_HEALTH_INFORMATION;
889                         break;
890                 case PCI_NVME_AE_TYPE_NOTICE:
891                         if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
892                                 EPRINTLN("%s unknown AEN notice type %u",
893                                     __func__, aen->event_data);
894                                 status = NVME_SC_INTERNAL_DEVICE_ERROR;
895                                 break;
896                         }
897                         mask >>= 8;
898                         if (((1 << aen->event_data) & mask) == 0)
899                                 continue;
900                         switch (aen->event_data) {
901                         case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
902                                 lid = NVME_LOG_CHANGED_NAMESPACE;
903                                 break;
904                         case PCI_NVME_AE_INFO_FW_ACTIVATION:
905                                 lid = NVME_LOG_FIRMWARE_SLOT;
906                                 break;
907                         case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
908                                 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
909                                 break;
910                         case PCI_NVME_AE_INFO_ANA_CHANGE:
911                                 lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling
912                                 break;
913                         case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
914                                 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
915                                 break;
916                         case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
917                                 lid = NVME_LOG_LBA_STATUS_INFORMATION;
918                                 break;
919                         case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
920                                 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
921                                 break;
922                         default:
923                                 lid = 0;
924                         }
925                         break;
926                 default:
927                         /* bad type?!? */
928                         EPRINTLN("%s unknown AEN type %u", __func__, atype);
929                         status = NVME_SC_INTERNAL_DEVICE_ERROR;
930                         break;
931                 }
932
933                 aer = pci_nvme_aer_get(sc);
934                 assert(aer != NULL);
935
936                 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
937                 pci_nvme_cq_update(sc, &sc->compl_queues[0],
938                     (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
939                     aer->cid,
940                     0,          /* SQID */
941                     status);
942
943                 aen->event_data = 0;
944                 aen->posted = false;
945
946                 pci_generate_msix(sc->nsc_pi, 0);
947         }
948 }
949
950 static void *
951 aen_thr(void *arg)
952 {
953         struct pci_nvme_softc *sc;
954
955         sc = arg;
956
957         pthread_mutex_lock(&sc->aen_mtx);
958         for (;;) {
959                 pci_nvme_aen_process(sc);
960                 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
961         }
962         pthread_mutex_unlock(&sc->aen_mtx);
963
964         pthread_exit(NULL);
965         return (NULL);
966 }
967
968 static void
969 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
970 {
971         uint32_t i;
972
973         DPRINTF("%s", __func__);
974
975         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
976             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
977             (60 << NVME_CAP_LO_REG_TO_SHIFT);
978
979         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
980
981         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
982
983         sc->regs.cc = 0;
984         sc->regs.csts = 0;
985
986         assert(sc->submit_queues != NULL);
987
988         for (i = 0; i < sc->num_squeues + 1; i++) {
989                 sc->submit_queues[i].qbase = NULL;
990                 sc->submit_queues[i].size = 0;
991                 sc->submit_queues[i].cqid = 0;
992                 sc->submit_queues[i].tail = 0;
993                 sc->submit_queues[i].head = 0;
994         }
995
996         assert(sc->compl_queues != NULL);
997
998         for (i = 0; i < sc->num_cqueues + 1; i++) {
999                 sc->compl_queues[i].qbase = NULL;
1000                 sc->compl_queues[i].size = 0;
1001                 sc->compl_queues[i].tail = 0;
1002                 sc->compl_queues[i].head = 0;
1003         }
1004
1005         sc->num_q_is_set = false;
1006
1007         pci_nvme_aer_destroy(sc);
1008         pci_nvme_aen_destroy(sc);
1009 }
1010
1011 static void
1012 pci_nvme_reset(struct pci_nvme_softc *sc)
1013 {
1014         pthread_mutex_lock(&sc->mtx);
1015         pci_nvme_reset_locked(sc);
1016         pthread_mutex_unlock(&sc->mtx);
1017 }
1018
1019 static void
1020 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1021 {
1022         uint16_t acqs, asqs;
1023
1024         DPRINTF("%s", __func__);
1025
1026         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1027         sc->submit_queues[0].size = asqs;
1028         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1029                     sizeof(struct nvme_command) * asqs);
1030
1031         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1032                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1033
1034         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
1035             NVME_AQA_REG_ACQS_MASK) + 1;
1036         sc->compl_queues[0].size = acqs;
1037         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1038                  sizeof(struct nvme_completion) * acqs);
1039         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1040
1041         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1042                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1043 }
1044
1045 static int
1046 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1047         size_t len, enum nvme_copy_dir dir)
1048 {
1049         uint8_t *p;
1050         size_t bytes;
1051
1052         if (len > (8 * 1024)) {
1053                 return (-1);
1054         }
1055
1056         /* Copy from the start of prp1 to the end of the physical page */
1057         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1058         bytes = MIN(bytes, len);
1059
1060         p = vm_map_gpa(ctx, prp1, bytes);
1061         if (p == NULL) {
1062                 return (-1);
1063         }
1064
1065         if (dir == NVME_COPY_TO_PRP)
1066                 memcpy(p, b, bytes);
1067         else
1068                 memcpy(b, p, bytes);
1069
1070         b += bytes;
1071
1072         len -= bytes;
1073         if (len == 0) {
1074                 return (0);
1075         }
1076
1077         len = MIN(len, PAGE_SIZE);
1078
1079         p = vm_map_gpa(ctx, prp2, len);
1080         if (p == NULL) {
1081                 return (-1);
1082         }
1083
1084         if (dir == NVME_COPY_TO_PRP)
1085                 memcpy(p, b, len);
1086         else
1087                 memcpy(b, p, len);
1088
1089         return (0);
1090 }
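
/*
 * Note that the helper above handles at most two PRP entries (PRP2 is treated
 * as a plain page pointer, never a PRP list), which is sufficient for the
 * small Admin transfers it serves. For example, a 4 KiB Identify buffer whose
 * PRP1 points 1 KiB into a page is copied as 3 KiB via PRP1 followed by
 * 1 KiB via PRP2.
 */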
1091
1092 /*
1093  * Write a Completion Queue Entry update
1094  *
1095  * Write the completion and update the doorbell value
1096  */
1097 static void
1098 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1099                 struct nvme_completion_queue *cq,
1100                 uint32_t cdw0,
1101                 uint16_t cid,
1102                 uint16_t sqid,
1103                 uint16_t status)
1104 {
1105         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1106         struct nvme_completion *cqe;
1107
1108         assert(cq->qbase != NULL);
1109
1110         pthread_mutex_lock(&cq->mtx);
1111
1112         cqe = &cq->qbase[cq->tail];
1113
1114         /* Flip the phase bit */
1115         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1116
1117         cqe->cdw0 = cdw0;
1118         cqe->sqhd = sq->head;
1119         cqe->sqid = sqid;
1120         cqe->cid = cid;
1121         cqe->status = status;
1122
1123         cq->tail++;
1124         if (cq->tail >= cq->size) {
1125                 cq->tail = 0;
1126         }
1127
1128         pthread_mutex_unlock(&cq->mtx);
1129 }
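
/*
 * Phase Tag example: assuming the host zeroes the completion queue memory (as
 * drivers normally do), the XOR above writes entries with P = 1 on the first
 * pass through the queue; once the tail wraps, freshly written entries carry
 * P = 0, which is how the host tells new completions from ones it has already
 * consumed.
 */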
1130
1131 static int
1132 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1133         struct nvme_completion* compl)
1134 {
1135         uint16_t qid = command->cdw10 & 0xffff;
1136
1137         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1138         if (qid == 0 || qid > sc->num_squeues ||
1139             (sc->submit_queues[qid].qbase == NULL)) {
1140                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1141                         __func__, qid, sc->num_squeues);
1142                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1143                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1144                 return (1);
1145         }
1146
1147         sc->submit_queues[qid].qbase = NULL;
1148         sc->submit_queues[qid].cqid = 0;
1149         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1150         return (1);
1151 }
1152
1153 static int
1154 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1155         struct nvme_completion* compl)
1156 {
1157         if (command->cdw11 & NVME_CMD_CDW11_PC) {
1158                 uint16_t qid = command->cdw10 & 0xffff;
1159                 struct nvme_submission_queue *nsq;
1160
1161                 if ((qid == 0) || (qid > sc->num_squeues) ||
1162                     (sc->submit_queues[qid].qbase != NULL)) {
1163                         WPRINTF("%s queue index %u > num_squeues %u",
1164                                 __func__, qid, sc->num_squeues);
1165                         pci_nvme_status_tc(&compl->status,
1166                             NVME_SCT_COMMAND_SPECIFIC,
1167                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1168                         return (1);
1169                 }
1170
1171                 nsq = &sc->submit_queues[qid];
1172                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1173                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1174                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1175                         /*
1176                          * Queues must specify at least two entries
1177                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1178                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1179                          */
1180                         pci_nvme_status_tc(&compl->status,
1181                             NVME_SCT_COMMAND_SPECIFIC,
1182                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1183                         return (1);
1184                 }
1185                 nsq->head = nsq->tail = 0;
1186
1187                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1188                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1189                         pci_nvme_status_tc(&compl->status,
1190                             NVME_SCT_COMMAND_SPECIFIC,
1191                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1192                         return (1);
1193                 }
1194
1195                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1196                         pci_nvme_status_tc(&compl->status,
1197                             NVME_SCT_COMMAND_SPECIFIC,
1198                             NVME_SC_COMPLETION_QUEUE_INVALID);
1199                         return (1);
1200                 }
1201
1202                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1203
1204                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1205                               sizeof(struct nvme_command) * (size_t)nsq->size);
1206
1207                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1208                         qid, nsq->size, nsq->qbase, nsq->cqid);
1209
1210                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1211
1212                 DPRINTF("%s completed creating IOSQ qid %u",
1213                          __func__, qid);
1214         } else {
1215                 /*
1216                  * Guest sent a non-contiguous submission queue request,
1217                  * which this emulation does not support.
1218                  */
1219                 WPRINTF("%s unsupported non-contig (list-based) "
1220                          "create i/o submission queue", __func__);
1221
1222                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1223         }
1224         return (1);
1225 }
1226
1227 static int
1228 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1229         struct nvme_completion* compl)
1230 {
1231         uint16_t qid = command->cdw10 & 0xffff;
1232         uint16_t sqid;
1233
1234         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1235         if (qid == 0 || qid > sc->num_cqueues ||
1236             (sc->compl_queues[qid].qbase == NULL)) {
1237                 WPRINTF("%s queue index %u / num_cqueues %u",
1238                         __func__, qid, sc->num_cqueues);
1239                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1240                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1241                 return (1);
1242         }
1243
1244         /* Deleting an Active CQ is an error */
1245         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1246                 if (sc->submit_queues[sqid].cqid == qid) {
1247                         pci_nvme_status_tc(&compl->status,
1248                             NVME_SCT_COMMAND_SPECIFIC,
1249                             NVME_SC_INVALID_QUEUE_DELETION);
1250                         return (1);
1251                 }
1252
1253         sc->compl_queues[qid].qbase = NULL;
1254         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1255         return (1);
1256 }
1257
1258 static int
1259 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1260         struct nvme_completion* compl)
1261 {
1262         struct nvme_completion_queue *ncq;
1263         uint16_t qid = command->cdw10 & 0xffff;
1264
1265         /* Only support Physically Contiguous queues */
1266         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1267                 WPRINTF("%s unsupported non-contig (list-based) "
1268                          "create i/o completion queue",
1269                          __func__);
1270
1271                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1272                 return (1);
1273         }
1274
1275         if ((qid == 0) || (qid > sc->num_cqueues) ||
1276             (sc->compl_queues[qid].qbase != NULL)) {
1277                 WPRINTF("%s queue index %u > num_cqueues %u",
1278                         __func__, qid, sc->num_cqueues);
1279                 pci_nvme_status_tc(&compl->status,
1280                     NVME_SCT_COMMAND_SPECIFIC,
1281                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1282                 return (1);
1283         }
1284
1285         ncq = &sc->compl_queues[qid];
1286         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1287         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1288         if (ncq->intr_vec > (sc->max_queues + 1)) {
1289                 pci_nvme_status_tc(&compl->status,
1290                     NVME_SCT_COMMAND_SPECIFIC,
1291                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1292                 return (1);
1293         }
1294
1295         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1296         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1297                 /*
1298                  * Queues must specify at least two entries
1299                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1300                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1301                  */
1302                 pci_nvme_status_tc(&compl->status,
1303                     NVME_SCT_COMMAND_SPECIFIC,
1304                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1305                 return (1);
1306         }
1307         ncq->head = ncq->tail = 0;
1308         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1309                      command->prp1,
1310                      sizeof(struct nvme_command) * (size_t)ncq->size);
1311
1312         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1313
1314
1315         return (1);
1316 }
1317
1318 static int
1319 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1320         struct nvme_completion* compl)
1321 {
1322         uint32_t logsize;
1323         uint8_t logpage = command->cdw10 & 0xFF;
1324
1325         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1326
1327         /*
1328          * Command specifies the number of dwords to return in fields NUMDU
1329          * and NUMDL. This is a zero-based value.
1330          */
1331         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1332         logsize *= sizeof(uint32_t);
1333
1334         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1335
1336         switch (logpage) {
1337         case NVME_LOG_ERROR:
1338                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1339                     command->prp2, (uint8_t *)&sc->err_log,
1340                     MIN(logsize, sizeof(sc->err_log)),
1341                     NVME_COPY_TO_PRP);
1342                 break;
1343         case NVME_LOG_HEALTH_INFORMATION:
1344                 pthread_mutex_lock(&sc->mtx);
1345                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1346                     sizeof(sc->health_log.data_units_read));
1347                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1348                     sizeof(sc->health_log.data_units_written));
1349                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1350                     sizeof(sc->health_log.host_read_commands));
1351                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1352                     sizeof(sc->health_log.host_write_commands));
1353                 pthread_mutex_unlock(&sc->mtx);
1354
1355                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1356                     command->prp2, (uint8_t *)&sc->health_log,
1357                     MIN(logsize, sizeof(sc->health_log)),
1358                     NVME_COPY_TO_PRP);
1359                 break;
1360         case NVME_LOG_FIRMWARE_SLOT:
1361                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1362                     command->prp2, (uint8_t *)&sc->fw_log,
1363                     MIN(logsize, sizeof(sc->fw_log)),
1364                     NVME_COPY_TO_PRP);
1365                 break;
1366         default:
1367                 DPRINTF("%s get log page %x command not supported",
1368                         __func__, logpage);
1369
1370                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1371                     NVME_SC_INVALID_LOG_PAGE);
1372         }
1373
1374         return (1);
1375 }
1376
1377 static int
1378 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1379         struct nvme_completion* compl)
1380 {
1381         void *dest;
1382         uint16_t status;
1383
1384         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1385                 command->cdw10 & 0xFF, command->nsid);
1386
1387         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1388
1389         switch (command->cdw10 & 0xFF) {
1390         case 0x00: /* return Identify Namespace data structure */
1391                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1392                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1393                     NVME_COPY_TO_PRP);
1394                 break;
1395         case 0x01: /* return Identify Controller data structure */
1396                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1397                     command->prp2, (uint8_t *)&sc->ctrldata,
1398                     sizeof(sc->ctrldata),
1399                     NVME_COPY_TO_PRP);
1400                 break;
1401         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1402                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1403                                   sizeof(uint32_t) * 1024);
1404                 /* All unused entries shall be zero */
1405                 bzero(dest, sizeof(uint32_t) * 1024);
1406                 ((uint32_t *)dest)[0] = 1;
1407                 break;
1408         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1409                 if (command->nsid != 1) {
1410                         pci_nvme_status_genc(&status,
1411                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1412                         break;
1413                 }
1414                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1415                                   sizeof(uint32_t) * 1024);
1416                 /* All bytes after the descriptor shall be zero */
1417                 bzero(dest, sizeof(uint32_t) * 1024);
1418
1419                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1420                 ((uint8_t *)dest)[0] = 1;
1421                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1422                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1423                 break;
1424         default:
1425                 DPRINTF("%s unsupported identify command requested 0x%x",
1426                          __func__, command->cdw10 & 0xFF);
1427                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1428                 break;
1429         }
1430
1431         compl->status = status;
1432         return (1);
1433 }
1434
1435 static const char *
1436 nvme_fid_to_name(uint8_t fid)
1437 {
1438         const char *name;
1439
1440         switch (fid) {
1441         case NVME_FEAT_ARBITRATION:
1442                 name = "Arbitration";
1443                 break;
1444         case NVME_FEAT_POWER_MANAGEMENT:
1445                 name = "Power Management";
1446                 break;
1447         case NVME_FEAT_LBA_RANGE_TYPE:
1448                 name = "LBA Range Type";
1449                 break;
1450         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1451                 name = "Temperature Threshold";
1452                 break;
1453         case NVME_FEAT_ERROR_RECOVERY:
1454                 name = "Error Recovery";
1455                 break;
1456         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1457                 name = "Volatile Write Cache";
1458                 break;
1459         case NVME_FEAT_NUMBER_OF_QUEUES:
1460                 name = "Number of Queues";
1461                 break;
1462         case NVME_FEAT_INTERRUPT_COALESCING:
1463                 name = "Interrupt Coalescing";
1464                 break;
1465         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1466                 name = "Interrupt Vector Configuration";
1467                 break;
1468         case NVME_FEAT_WRITE_ATOMICITY:
1469                 name = "Write Atomicity Normal";
1470                 break;
1471         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1472                 name = "Asynchronous Event Configuration";
1473                 break;
1474         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1475                 name = "Autonomous Power State Transition";
1476                 break;
1477         case NVME_FEAT_HOST_MEMORY_BUFFER:
1478                 name = "Host Memory Buffer";
1479                 break;
1480         case NVME_FEAT_TIMESTAMP:
1481                 name = "Timestamp";
1482                 break;
1483         case NVME_FEAT_KEEP_ALIVE_TIMER:
1484                 name = "Keep Alive Timer";
1485                 break;
1486         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1487                 name = "Host Controlled Thermal Management";
1488                 break;
1489         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1490                 name = "Non-Operational Power State Config";
1491                 break;
1492         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1493                 name = "Read Recovery Level Config";
1494                 break;
1495         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1496                 name = "Predictable Latency Mode Config";
1497                 break;
1498         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1499                 name = "Predictable Latency Mode Window";
1500                 break;
1501         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1502                 name = "LBA Status Information Report Interval";
1503                 break;
1504         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1505                 name = "Host Behavior Support";
1506                 break;
1507         case NVME_FEAT_SANITIZE_CONFIG:
1508                 name = "Sanitize Config";
1509                 break;
1510         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1511                 name = "Endurance Group Event Configuration";
1512                 break;
1513         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1514                 name = "Software Progress Marker";
1515                 break;
1516         case NVME_FEAT_HOST_IDENTIFIER:
1517                 name = "Host Identifier";
1518                 break;
1519         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1520                 name = "Reservation Notification Mask";
1521                 break;
1522         case NVME_FEAT_RESERVATION_PERSISTENCE:
1523                 name = "Reservation Persistence";
1524                 break;
1525         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1526                 name = "Namespace Write Protection Config";
1527                 break;
1528         default:
1529                 name = "Unknown";
1530                 break;
1531         }
1532
1533         return (name);
1534 }
1535
1536 static void
1537 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1538     struct nvme_feature_obj *feat,
1539     struct nvme_command *command,
1540     struct nvme_completion *compl)
1541 {
1542
1543         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1544 }
1545
1546 static void
1547 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1548     struct nvme_feature_obj *feat,
1549     struct nvme_command *command,
1550     struct nvme_completion *compl)
1551 {
1552         uint32_t i;
1553         uint32_t cdw11 = command->cdw11;
1554         uint16_t iv;
1555         bool cd;
1556
1557         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1558
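             /* CDW11 layout: bits 15:0 = Interrupt Vector (IV), bit 16 = Coalescing Disable (CD) */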
1559         iv = cdw11 & 0xffff;
1560         cd = cdw11 & (1 << 16);
1561
1562         if (iv > (sc->max_queues + 1)) {
1563                 return;
1564         }
1565
1566         /* The Admin Queue (IV 0) does not support Interrupt Coalescing, so only Coalescing Disable (CD=1) is valid */
1567         if ((iv == 0) && !cd)
1568                 return;
1569
1570         /* Requested Interrupt Vector must be used by a CQ */
1571         for (i = 0; i < sc->num_cqueues + 1; i++) {
1572                 if (sc->compl_queues[i].intr_vec == iv) {
1573                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1574                 }
1575         }
1576
1577 }
1578
1579 static void
1580 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1581     struct nvme_feature_obj *feat,
1582     struct nvme_command *command,
1583     struct nvme_completion *compl)
1584 {
1585         uint16_t nqr;   /* Number of Queues Requested */
1586
1587         if (sc->num_q_is_set) {
1588                 WPRINTF("%s: Number of Queues already set", __func__);
1589                 pci_nvme_status_genc(&compl->status,
1590                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1591                 return;
1592         }
1593
1594         nqr = command->cdw11 & 0xFFFF;
1595         if (nqr == 0xffff) {
1596                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1597                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1598                 return;
1599         }
1600
1601         sc->num_squeues = ONE_BASED(nqr);
1602         if (sc->num_squeues > sc->max_queues) {
1603                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1604                                         sc->max_queues);
1605                 sc->num_squeues = sc->max_queues;
1606         }
1607
1608         nqr = (command->cdw11 >> 16) & 0xFFFF;
1609         if (nqr == 0xffff) {
1610                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1611                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1612                 return;
1613         }
1614
1615         sc->num_cqueues = ONE_BASED(nqr);
1616         if (sc->num_cqueues > sc->max_queues) {
1617                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1618                                         sc->max_queues);
1619                 sc->num_cqueues = sc->max_queues;
1620         }
1621
1622         /* Patch the command value which will be saved on callback's return */
1623         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1624         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
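             /*
              * Per the spec, completion DW0 reports the allocated counts:
              * NSQA in bits 15:0 and NCQA in bits 31:16 (both zero-based),
              * which is what NVME_FEATURE_NUM_QUEUES() packs here.
              */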
1625
1626         sc->num_q_is_set = true;
1627 }
1628
1629 static int
1630 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1631         struct nvme_completion *compl)
1632 {
1633         struct nvme_feature_obj *feat;
1634         uint32_t nsid = command->nsid;
1635         uint8_t fid = command->cdw10 & 0xFF;
1636
1637         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1638
1639         if (fid >= NVME_FID_MAX) {
1640                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1641                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1642                 return (1);
1643         }
1644         feat = &sc->feat[fid];
1645
1646         if (!feat->namespace_specific &&
1647             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1648                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1649                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1650                 return (1);
1651         }
1652
1653         compl->cdw0 = 0;
1654         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1655
1656         if (feat->set)
1657                 feat->set(sc, feat, command, compl);
1658
1659         DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1660         if (compl->status == NVME_SC_SUCCESS) {
1661                 feat->cdw11 = command->cdw11;
1662                 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1663                     (command->cdw11 != 0))
1664                         pci_nvme_aen_notify(sc);
1665         }
1666
1667         return (0);
1668 }
1669
1670 static int
1671 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1672         struct nvme_completion* compl)
1673 {
1674         struct nvme_feature_obj *feat;
1675         uint8_t fid = command->cdw10 & 0xFF;
1676
1677         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1678
1679         if (fid >= NVME_FID_MAX) {
1680                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1681                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1682                 return (1);
1683         }
1684
1685         compl->cdw0 = 0;
1686         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1687
1688         feat = &sc->feat[fid];
1689         if (feat->get) {
1690                 feat->get(sc, feat, command, compl);
1691         }
1692
1693         if (compl->status == NVME_SC_SUCCESS) {
1694                 compl->cdw0 = feat->cdw11;
1695         }
1696
1697         return (0);
1698 }
1699
1700 static int
1701 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1702         struct nvme_completion* compl)
1703 {
1704         uint8_t ses, lbaf, pi;
1705
1706         /* Only supports Secure Erase Setting - User Data Erase */
1707         ses = (command->cdw10 >> 9) & 0x7;
1708         if (ses > 0x1) {
1709                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1710                 return (1);
1711         }
1712
1713         /* Only supports a single LBA Format */
1714         lbaf = command->cdw10 & 0xf;
1715         if (lbaf != 0) {
1716                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1717                     NVME_SC_INVALID_FORMAT);
1718                 return (1);
1719         }
1720
1721         /* Doesn't support Protection Information */
1722         pi = (command->cdw10 >> 5) & 0x7;
1723         if (pi != 0) {
1724                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1725                 return (1);
1726         }
1727
1728         if (sc->nvstore.type == NVME_STOR_RAM) {
1729                 if (sc->nvstore.ctx)
1730                         free(sc->nvstore.ctx);
1731                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1732                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1733         } else {
1734                 struct pci_nvme_ioreq *req;
1735                 int err;
1736
1737                 req = pci_nvme_get_ioreq(sc);
1738                 if (req == NULL) {
1739                         pci_nvme_status_genc(&compl->status,
1740                             NVME_SC_INTERNAL_DEVICE_ERROR);
1741                         WPRINTF("%s: unable to allocate IO req", __func__);
1742                         return (1);
1743                 }
1744                 req->nvme_sq = &sc->submit_queues[0];
1745                 req->sqid = 0;
1746                 req->opc = command->opc;
1747                 req->cid = command->cid;
1748                 req->nsid = command->nsid;
1749
1750                 req->io_req.br_offset = 0;
1751                 req->io_req.br_resid = sc->nvstore.size;
1752                 req->io_req.br_callback = pci_nvme_io_done;
1753
1754                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1755                 if (err) {
1756                         pci_nvme_status_genc(&compl->status,
1757                             NVME_SC_INTERNAL_DEVICE_ERROR);
1758                         pci_nvme_release_ioreq(sc, req);
1759                 }
1760         }
1761
1762         return (1);
1763 }
1764
1765 static int
1766 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1767         struct nvme_completion* compl)
1768 {
1769         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1770                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1771
1772         /* TODO: search for the command ID and abort it */
1773
1774         compl->cdw0 = 1;
1775         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1776         return (1);
1777 }
1778
1779 static int
1780 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1781         struct nvme_command* command, struct nvme_completion* compl)
1782 {
1783         DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1784             sc->aer_count, sc->ctrldata.aerl, command->cid);
1785
1786         /* Don't exceed the Async Event Request Limit (AERL). */
1787         if (pci_nvme_aer_limit_reached(sc)) {
1788                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1789                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1790                 return (1);
1791         }
1792
1793         if (pci_nvme_aer_add(sc, command->cid)) {
1794                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1795                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1796                 return (1);
1797         }
1798
1799         /*
1800          * Events are raised asynchronously, as configured by the Set Features
1801          * command. Only post a completion once a matching event occurs, so
1802          * leave the status as NVME_NO_STATUS for now.
1803          */
1804         compl->status = NVME_NO_STATUS;
1805         pci_nvme_aen_notify(sc);
1806
1807         return (0);
1808 }
1809
1810 static void
1811 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1812 {
1813         struct nvme_completion compl;
1814         struct nvme_command *cmd;
1815         struct nvme_submission_queue *sq;
1816         struct nvme_completion_queue *cq;
1817         uint16_t sqhead;
1818
1819         DPRINTF("%s index %u", __func__, (uint32_t)value);
1820
1821         sq = &sc->submit_queues[0];
1822         cq = &sc->compl_queues[0];
1823
1824         pthread_mutex_lock(&sq->mtx);
1825
1826         sqhead = sq->head;
1827         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1828         
1829         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1830                 cmd = &(sq->qbase)[sqhead];
1831                 compl.cdw0 = 0;
1832                 compl.status = 0;
1833
1834                 switch (cmd->opc) {
1835                 case NVME_OPC_DELETE_IO_SQ:
1836                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1837                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1838                         break;
1839                 case NVME_OPC_CREATE_IO_SQ:
1840                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1841                         nvme_opc_create_io_sq(sc, cmd, &compl);
1842                         break;
1843                 case NVME_OPC_DELETE_IO_CQ:
1844                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1845                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1846                         break;
1847                 case NVME_OPC_CREATE_IO_CQ:
1848                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1849                         nvme_opc_create_io_cq(sc, cmd, &compl);
1850                         break;
1851                 case NVME_OPC_GET_LOG_PAGE:
1852                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1853                         nvme_opc_get_log_page(sc, cmd, &compl);
1854                         break;
1855                 case NVME_OPC_IDENTIFY:
1856                         DPRINTF("%s command IDENTIFY", __func__);
1857                         nvme_opc_identify(sc, cmd, &compl);
1858                         break;
1859                 case NVME_OPC_ABORT:
1860                         DPRINTF("%s command ABORT", __func__);
1861                         nvme_opc_abort(sc, cmd, &compl);
1862                         break;
1863                 case NVME_OPC_SET_FEATURES:
1864                         DPRINTF("%s command SET_FEATURES", __func__);
1865                         nvme_opc_set_features(sc, cmd, &compl);
1866                         break;
1867                 case NVME_OPC_GET_FEATURES:
1868                         DPRINTF("%s command GET_FEATURES", __func__);
1869                         nvme_opc_get_features(sc, cmd, &compl);
1870                         break;
1871                 case NVME_OPC_FIRMWARE_ACTIVATE:
1872                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1873                         pci_nvme_status_tc(&compl.status,
1874                             NVME_SCT_COMMAND_SPECIFIC,
1875                             NVME_SC_INVALID_FIRMWARE_SLOT);
1876                         break;
1877                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1878                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1879                         nvme_opc_async_event_req(sc, cmd, &compl);
1880                         break;
1881                 case NVME_OPC_FORMAT_NVM:
1882                         DPRINTF("%s command FORMAT_NVM", __func__);
1883                         if ((sc->ctrldata.oacs &
1884                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1885                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                                     break;
1886                         }
1887                         compl.status = NVME_NO_STATUS;
1888                         nvme_opc_format_nvm(sc, cmd, &compl);
1889                         break;
1890                 default:
1891                         DPRINTF("0x%x command is not implemented",
1892                             cmd->opc);
1893                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1894                 }
1895                 sqhead = (sqhead + 1) % sq->size;
1896
1897                 if (NVME_COMPLETION_VALID(compl)) {
1898                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1899                             compl.cdw0,
1900                             cmd->cid,
1901                             0,          /* SQID */
1902                             compl.status);
1903                 }
1904         }
1905
1906         DPRINTF("setting sqhead %u", sqhead);
1907         sq->head = sqhead;
1908
1909         if (cq->head != cq->tail)
1910                 pci_generate_msix(sc->nsc_pi, 0);
1911
1912         pthread_mutex_unlock(&sq->mtx);
1913 }
1914
1915 /*
1916  * Update the Write and Read statistics reported in SMART data
1917  *
1918  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1919  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1920  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1921  */
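     /*
      * Illustrative example: with the remainder seeded at 999, the first
      * successful 4 KiB write adds 8 blocks (999 + 8 = 1007), so one data unit
      * is counted and the remainder drops to 7.
      */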
1922 static void
1923 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1924     size_t bytes, uint16_t status)
1925 {
1926
1927         pthread_mutex_lock(&sc->mtx);
1928         switch (opc) {
1929         case NVME_OPC_WRITE:
1930                 sc->write_commands++;
1931                 if (status != NVME_SC_SUCCESS)
1932                         break;
1933                 sc->write_dunits_remainder += (bytes / 512);
1934                 while (sc->write_dunits_remainder >= 1000) {
1935                         sc->write_data_units++;
1936                         sc->write_dunits_remainder -= 1000;
1937                 }
1938                 break;
1939         case NVME_OPC_READ:
1940                 sc->read_commands++;
1941                 if (status != NVME_SC_SUCCESS)
1942                         break;
1943                 sc->read_dunits_remainder += (bytes / 512);
1944                 while (sc->read_dunits_remainder >= 1000) {
1945                         sc->read_data_units++;
1946                         sc->read_dunits_remainder -= 1000;
1947                 }
1948                 break;
1949         default:
1950                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1951                 break;
1952         }
1953         pthread_mutex_unlock(&sc->mtx);
1954 }
1955
1956 /*
1957  * Check if the combination of Starting LBA (slba) and Number of Logical
1958  * Blocks (nlb) exceeds the range of the underlying storage.
1959  *
1960  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1961  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1962  * overflow.
1963  */
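     /*
      * E.g. with 512 byte sectors (sectsz_bits == 9), the shift check below
      * rejects any SLBA with bits 63:55 set, since its byte offset would not
      * fit in 64 bits.
      */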
1964 static bool
1965 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1966     uint32_t nlb)
1967 {
1968         size_t  offset, bytes;
1969
1970         /* Overflow check of multiplying Starting LBA by the sector size */
1971         if (slba >> (64 - nvstore->sectsz_bits))
1972                 return (true);
1973
1974         offset = slba << nvstore->sectsz_bits;
1975         bytes = nlb << nvstore->sectsz_bits;
1976
1977         /* Overflow check of Number of Logical Blocks */
1978         if ((nvstore->size - offset) < bytes)
1979                 return (true);
1980
1981         return (false);
1982 }
1983
1984 static int
1985 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1986         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1987 {
1988         int iovidx;
1989
1990         if (req == NULL)
1991                 return (-1);
1992
1993         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1994                 return (-1);
1995         }
1996
1997         /* concatenate contig block-iovs to minimize number of iovs */
1998         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1999                 iovidx = req->io_req.br_iovcnt - 1;
2000
2001                 req->io_req.br_iov[iovidx].iov_base =
2002                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2003                                      req->prev_gpaddr, size);
2004
2005                 req->prev_size += size;
2006                 req->io_req.br_resid += size;
2007
2008                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2009         } else {
2010                 iovidx = req->io_req.br_iovcnt;
2011                 if (iovidx == 0) {
2012                         req->io_req.br_offset = lba;
2013                         req->io_req.br_resid = 0;
2014                         req->io_req.br_param = req;
2015                 }
2016
2017                 req->io_req.br_iov[iovidx].iov_base =
2018                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2019                                      gpaddr, size);
2020
2021                 req->io_req.br_iov[iovidx].iov_len = size;
2022
2023                 req->prev_gpaddr = gpaddr;
2024                 req->prev_size = size;
2025                 req->io_req.br_resid += size;
2026
2027                 req->io_req.br_iovcnt++;
2028         }
2029
2030         return (0);
2031 }
2032
2033 static void
2034 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2035         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2036         uint32_t cdw0, uint16_t status)
2037 {
2038         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2039
2040         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2041                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2042                  NVME_STATUS_GET_SC(status));
2043
2044         pci_nvme_cq_update(sc, cq,
2045             0,          /* CDW0 */
2046             cid,
2047             sqid,
2048             status);
2049
2050         if (cq->head != cq->tail) {
2051                 if (cq->intr_en & NVME_CQ_INTEN) {
2052                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2053                 } else {
2054                         DPRINTF("%s: CQ%u interrupt disabled",
2055                                                 __func__, sq->cqid);
2056                 }
2057         }
2058 }
2059
2060 static void
2061 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2062 {
2063         req->sc = NULL;
2064         req->nvme_sq = NULL;
2065         req->sqid = 0;
2066
2067         pthread_mutex_lock(&sc->mtx);
2068
2069         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2070         sc->pending_ios--;
2071
2072         /* when no more IO pending, can set to ready if device reset/enabled */
2073         if (sc->pending_ios == 0 &&
2074             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2075                 sc->regs.csts |= NVME_CSTS_RDY;
2076
2077         pthread_mutex_unlock(&sc->mtx);
2078
2079         sem_post(&sc->iosemlock);
2080 }
2081
2082 static struct pci_nvme_ioreq *
2083 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2084 {
2085         struct pci_nvme_ioreq *req = NULL;
2086
2087         sem_wait(&sc->iosemlock);
2088         pthread_mutex_lock(&sc->mtx);
2089
2090         req = STAILQ_FIRST(&sc->ioreqs_free);
2091         assert(req != NULL);
2092         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2093
2094         req->sc = sc;
2095
2096         sc->pending_ios++;
2097
2098         pthread_mutex_unlock(&sc->mtx);
2099
2100         req->io_req.br_iovcnt = 0;
2101         req->io_req.br_offset = 0;
2102         req->io_req.br_resid = 0;
2103         req->io_req.br_param = req;
2104         req->prev_gpaddr = 0;
2105         req->prev_size = 0;
2106
2107         return req;
2108 }
2109
2110 static void
2111 pci_nvme_io_done(struct blockif_req *br, int err)
2112 {
2113         struct pci_nvme_ioreq *req = br->br_param;
2114         struct nvme_submission_queue *sq = req->nvme_sq;
2115         uint16_t code, status;
2116
2117         DPRINTF("%s error %d %s", __func__, err, strerror(err));
2118
2119         /* TODO return correct error */
2120         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2121         pci_nvme_status_genc(&status, code);
2122
2123         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2124         pci_nvme_stats_write_read_update(req->sc, req->opc,
2125             req->bytes, status);
2126         pci_nvme_release_ioreq(req->sc, req);
2127 }
2128
2129 /*
2130  * Implements the Flush command. The specification states:
2131  *    If a volatile write cache is not present, Flush commands complete
2132  *    successfully and have no effect
2133  * in the description of the Volatile Write Cache (VWC) field of the Identify
2134  * Controller data. Therefore, set status to Success if the command is
2135  * not supported (i.e. RAM or as indicated by the blockif).
2136  */
2137 static bool
2138 nvme_opc_flush(struct pci_nvme_softc *sc,
2139     struct nvme_command *cmd,
2140     struct pci_nvme_blockstore *nvstore,
2141     struct pci_nvme_ioreq *req,
2142     uint16_t *status)
2143 {
2144         bool pending = false;
2145
2146         if (nvstore->type == NVME_STOR_RAM) {
2147                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2148         } else {
2149                 int err;
2150
2151                 req->io_req.br_callback = pci_nvme_io_done;
2152
2153                 err = blockif_flush(nvstore->ctx, &req->io_req);
2154                 switch (err) {
2155                 case 0:
2156                         pending = true;
2157                         break;
2158                 case EOPNOTSUPP:
2159                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2160                         break;
2161                 default:
2162                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2163                 }
2164         }
2165
2166         return (pending);
2167 }
2168
2169 static uint16_t
2170 nvme_write_read_ram(struct pci_nvme_softc *sc,
2171     struct pci_nvme_blockstore *nvstore,
2172     uint64_t prp1, uint64_t prp2,
2173     size_t offset, uint64_t bytes,
2174     bool is_write)
2175 {
2176         uint8_t *buf = nvstore->ctx;
2177         enum nvme_copy_dir dir;
2178         uint16_t status;
2179
2180         if (is_write)
2181                 dir = NVME_COPY_TO_PRP;
2182         else
2183                 dir = NVME_COPY_FROM_PRP;
2184
2185         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2186             buf + offset, bytes, dir))
2187                 pci_nvme_status_genc(&status,
2188                     NVME_SC_DATA_TRANSFER_ERROR);
2189         else
2190                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2191
2192         return (status);
2193 }
2194
2195 static uint16_t
2196 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2197     struct pci_nvme_blockstore *nvstore,
2198     struct pci_nvme_ioreq *req,
2199     uint64_t prp1, uint64_t prp2,
2200     size_t offset, uint64_t bytes,
2201     bool is_write)
2202 {
2203         uint64_t size;
2204         int err;
2205         uint16_t status = NVME_NO_STATUS;
2206
2207         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2208         if (pci_nvme_append_iov_req(sc, req, prp1,
2209             size, is_write, offset)) {
2210                 pci_nvme_status_genc(&status,
2211                     NVME_SC_DATA_TRANSFER_ERROR);
2212                 goto out;
2213         }
2214
2215         offset += size;
2216         bytes  -= size;
2217
2218         if (bytes == 0) {
2219                 ;
2220         } else if (bytes <= PAGE_SIZE) {
2221                 size = bytes;
2222                 if (pci_nvme_append_iov_req(sc, req, prp2,
2223                     size, is_write, offset)) {
2224                         pci_nvme_status_genc(&status,
2225                             NVME_SC_DATA_TRANSFER_ERROR);
2226                         goto out;
2227                 }
2228         } else {
2229                 void *vmctx = sc->nsc_pi->pi_vmctx;
2230                 uint64_t *prp_list = &prp2;
2231                 uint64_t *last = prp_list;
2232
2233                 /* PRP2 is pointer to a physical region page list */
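                     /*
                      * Each PRP list page holds NVME_PRP2_ITEMS 8-byte entries.
                      * When the cursor reaches the last entry of a page and more
                      * than a page of data remains, that entry points to the
                      * next PRP list page rather than to data, which is what the
                      * (prp_list == last) test below follows.
                      */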
2234                 while (bytes) {
2235                         /* Last entry in list points to the next list */
2236                         if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2237                                 uint64_t prp = *prp_list;
2238
2239                                 prp_list = paddr_guest2host(vmctx, prp,
2240                                     PAGE_SIZE - (prp % PAGE_SIZE));
2241                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
2242                         }
2243
2244                         size = MIN(bytes, PAGE_SIZE);
2245
2246                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
2247                             size, is_write, offset)) {
2248                                 pci_nvme_status_genc(&status,
2249                                     NVME_SC_DATA_TRANSFER_ERROR);
2250                                 goto out;
2251                         }
2252
2253                         offset += size;
2254                         bytes  -= size;
2255
2256                         prp_list++;
2257                 }
2258         }
2259         req->io_req.br_callback = pci_nvme_io_done;
2260         if (is_write)
2261                 err = blockif_write(nvstore->ctx, &req->io_req);
2262         else
2263                 err = blockif_read(nvstore->ctx, &req->io_req);
2264
2265         if (err)
2266                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2267 out:
2268         return (status);
2269 }
2270
2271 static bool
2272 nvme_opc_write_read(struct pci_nvme_softc *sc,
2273     struct nvme_command *cmd,
2274     struct pci_nvme_blockstore *nvstore,
2275     struct pci_nvme_ioreq *req,
2276     uint16_t *status)
2277 {
2278         uint64_t lba, nblocks, bytes;
2279         size_t offset;
2280         bool is_write = cmd->opc == NVME_OPC_WRITE;
2281         bool pending = false;
2282
2283         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2284         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2285
2286         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2287                 WPRINTF("%s command would exceed LBA range", __func__);
2288                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2289                 goto out;
2290         }
2291
2292         bytes  = nblocks << nvstore->sectsz_bits;
2293         if (bytes > NVME_MAX_DATA_SIZE) {
2294                 WPRINTF("%s command would exceed MDTS", __func__);
2295                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2296                 goto out;
2297         }
2298
2299         offset = lba << nvstore->sectsz_bits;
2300
2301         req->bytes = bytes;
2302         req->io_req.br_offset = lba;
2303
2304         /* PRP bits 1:0 must be zero */
2305         cmd->prp1 &= ~0x3UL;
2306         cmd->prp2 &= ~0x3UL;
2307
2308         if (nvstore->type == NVME_STOR_RAM) {
2309                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2310                     cmd->prp2, offset, bytes, is_write);
2311         } else {
2312                 *status = nvme_write_read_blockif(sc, nvstore, req,
2313                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2314
2315                 if (*status == NVME_NO_STATUS)
2316                         pending = true;
2317         }
2318 out:
2319         if (!pending)
2320                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2321
2322         return (pending);
2323 }
2324
2325 static void
2326 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2327 {
2328         struct pci_nvme_ioreq *req = br->br_param;
2329         struct pci_nvme_softc *sc = req->sc;
2330         bool done = true;
2331         uint16_t status;
2332
2333         if (err) {
2334                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2335         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2336                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2337         } else {
2338                 struct iovec *iov = req->io_req.br_iov;
2339
2340                 req->prev_gpaddr++;
2341                 iov += req->prev_gpaddr;
2342
2343                 /* The iov_* values already include the sector size */
2344                 req->io_req.br_offset = (off_t)iov->iov_base;
2345                 req->io_req.br_resid = iov->iov_len;
2346                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2347                         pci_nvme_status_genc(&status,
2348                             NVME_SC_INTERNAL_DEVICE_ERROR);
2349                 } else
2350                         done = false;
2351         }
2352
2353         if (done) {
2354                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2355                     req->cid, 0, status);
2356                 pci_nvme_release_ioreq(sc, req);
2357         }
2358 }
2359
2360 static bool
2361 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2362     struct nvme_command *cmd,
2363     struct pci_nvme_blockstore *nvstore,
2364     struct pci_nvme_ioreq *req,
2365     uint16_t *status)
2366 {
2367         struct nvme_dsm_range *range;
2368         uint32_t nr, r, non_zero, dr;
2369         int err;
2370         bool pending = false;
2371
2372         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2373                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2374                 goto out;
2375         }
2376
2377         nr = cmd->cdw10 & 0xff;
2378
2379         /* copy locally because a range entry could straddle PRPs */
2380         range = calloc(1, NVME_MAX_DSM_TRIM);
2381         if (range == NULL) {
2382                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2383                 goto out;
2384         }
2385         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2386             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2387
2388         /* Check for invalid ranges and the number of non-zero lengths */
2389         non_zero = 0;
2390         for (r = 0; r <= nr; r++) {
2391                 if (pci_nvme_out_of_range(nvstore,
2392                     range[r].starting_lba, range[r].length)) {
2393                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2394                         goto out;
2395                 }
2396                 if (range[r].length != 0)
2397                         non_zero++;
2398         }
2399
2400         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2401                 size_t offset, bytes;
2402                 int sectsz_bits = sc->nvstore.sectsz_bits;
2403
2404                 /*
2405                  * DSM calls are advisory only, and compliant controllers
2406                  * may choose to take no actions (i.e. return Success).
2407                  */
2408                 if (!nvstore->deallocate) {
2409                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2410                         goto out;
2411                 }
2412
2413                 /* If all ranges have a zero length, return Success */
2414                 if (non_zero == 0) {
2415                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2416                         goto out;
2417                 }
2418
2419                 if (req == NULL) {
2420                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2421                         goto out;
2422                 }
2423
2424                 offset = range[0].starting_lba << sectsz_bits;
2425                 bytes = range[0].length << sectsz_bits;
2426
2427                 /*
2428                  * If the request is for more than a single range, store
2429                  * the ranges in the br_iov. Optimize for the common case
2430                  * of a single range.
2431                  *
2432                  * Note that NVMe Number of Ranges is a zero based value
2433                  */
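                     /*
                      * Each non-zero range is stashed in an iovec, with iov_base
                      * holding the byte offset and iov_len the byte count;
                      * pci_nvme_dealloc_sm() then walks these entries, issuing
                      * one blockif_delete() per range.
                      */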
2434                 req->io_req.br_iovcnt = 0;
2435                 req->io_req.br_offset = offset;
2436                 req->io_req.br_resid = bytes;
2437
2438                 if (nr == 0) {
2439                         req->io_req.br_callback = pci_nvme_io_done;
2440                 } else {
2441                         struct iovec *iov = req->io_req.br_iov;
2442
2443                         for (r = 0, dr = 0; r <= nr; r++) {
2444                                 offset = range[r].starting_lba << sectsz_bits;
2445                                 bytes = range[r].length << sectsz_bits;
2446                                 if (bytes == 0)
2447                                         continue;
2448
2449                                 if ((nvstore->size - offset) < bytes) {
2450                                         pci_nvme_status_genc(status,
2451                                             NVME_SC_LBA_OUT_OF_RANGE);
2452                                         goto out;
2453                                 }
2454                                 iov[dr].iov_base = (void *)offset;
2455                                 iov[dr].iov_len = bytes;
2456                                 dr++;
2457                         }
2458                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2459
2460                         /*
2461                          * Use prev_gpaddr to track the current entry and
2462                          * prev_size to track the number of entries
2463                          */
2464                         req->prev_gpaddr = 0;
2465                         req->prev_size = dr;
2466                 }
2467
2468                 err = blockif_delete(nvstore->ctx, &req->io_req);
2469                 if (err)
2470                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2471                 else
2472                         pending = true;
2473         }
2474 out:
2475         free(range);
2476         return (pending);
2477 }
2478
2479 static void
2480 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2481 {
2482         struct nvme_submission_queue *sq;
2483         uint16_t status;
2484         uint16_t sqhead;
2485
2486         /* handle all submissions up to sq->tail index */
2487         sq = &sc->submit_queues[idx];
2488
2489         pthread_mutex_lock(&sq->mtx);
2490
2491         sqhead = sq->head;
2492         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2493                  idx, sqhead, sq->tail, sq->qbase);
2494
2495         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2496                 struct nvme_command *cmd;
2497                 struct pci_nvme_ioreq *req;
2498                 uint32_t nsid;
2499                 bool pending;
2500
2501                 pending = false;
2502                 req = NULL;
2503                 status = 0;
2504
2505                 cmd = &sq->qbase[sqhead];
2506                 sqhead = (sqhead + 1) % sq->size;
2507
2508                 nsid = le32toh(cmd->nsid);
2509                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2510                         pci_nvme_status_genc(&status,
2511                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2512                         status |=
2513                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2514                         goto complete;
2515                 }
2516
2517                 req = pci_nvme_get_ioreq(sc);
2518                 if (req == NULL) {
2519                         pci_nvme_status_genc(&status,
2520                             NVME_SC_INTERNAL_DEVICE_ERROR);
2521                         WPRINTF("%s: unable to allocate IO req", __func__);
2522                         goto complete;
2523                 }
2524                 req->nvme_sq = sq;
2525                 req->sqid = idx;
2526                 req->opc = cmd->opc;
2527                 req->cid = cmd->cid;
2528                 req->nsid = cmd->nsid;
2529
2530                 switch (cmd->opc) {
2531                 case NVME_OPC_FLUSH:
2532                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2533                             req, &status);
2534                         break;
2535                 case NVME_OPC_WRITE:
2536                 case NVME_OPC_READ:
2537                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2538                             req, &status);
2539                         break;
2540                 case NVME_OPC_WRITE_ZEROES:
2541                         /* TODO: write zeroes
2542                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2543                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2544                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2545                         break;
2546                 case NVME_OPC_DATASET_MANAGEMENT:
2547                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2548                             req, &status);
2549                         break;
2550                 default:
2551                         WPRINTF("%s unhandled io command 0x%x",
2552                             __func__, cmd->opc);
2553                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2554                 }
2555 complete:
2556                 if (!pending) {
2557                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2558                             status);
2559                         if (req != NULL)
2560                                 pci_nvme_release_ioreq(sc, req);
2561                 }
2562         }
2563
2564         sq->head = sqhead;
2565
2566         pthread_mutex_unlock(&sq->mtx);
2567 }
2568
2569 static void
2570 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2571         uint64_t idx, int is_sq, uint64_t value)
2572 {
2573         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2574                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2575
2576         if (is_sq) {
2577                 if (idx > sc->num_squeues) {
2578                         WPRINTF("%s queue index %lu overflow from "
2579                                  "guest (max %u)",
2580                                  __func__, idx, sc->num_squeues);
2581                         return;
2582                 }
2583
2584                 atomic_store_short(&sc->submit_queues[idx].tail,
2585                                    (uint16_t)value);
2586
2587                 if (idx == 0) {
2588                         pci_nvme_handle_admin_cmd(sc, value);
2589                 } else {
2590                         /* submission queue; handle new entries in SQ */
2591                         if (idx > sc->num_squeues) {
2592                                 WPRINTF("%s SQ index %lu overflow from "
2593                                          "guest (max %u)",
2594                                          __func__, idx, sc->num_squeues);
2595                                 return;
2596                         }
2597                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2598                 }
2599         } else {
2600                 if (idx > sc->num_cqueues) {
2601                         WPRINTF("%s queue index %lu overflow from "
2602                                  "guest (max %u)",
2603                                  __func__, idx, sc->num_cqueues);
2604                         return;
2605                 }
2606
2607                 atomic_store_short(&sc->compl_queues[idx].head,
2608                                 (uint16_t)value);
2609         }
2610 }
2611
2612 static void
2613 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2614 {
2615         const char *s = iswrite ? "WRITE" : "READ";
2616
2617         switch (offset) {
2618         case NVME_CR_CAP_LOW:
2619                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2620                 break;
2621         case NVME_CR_CAP_HI:
2622                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2623                 break;
2624         case NVME_CR_VS:
2625                 DPRINTF("%s %s NVME_CR_VS", func, s);
2626                 break;
2627         case NVME_CR_INTMS:
2628                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2629                 break;
2630         case NVME_CR_INTMC:
2631                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2632                 break;
2633         case NVME_CR_CC:
2634                 DPRINTF("%s %s NVME_CR_CC", func, s);
2635                 break;
2636         case NVME_CR_CSTS:
2637                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2638                 break;
2639         case NVME_CR_NSSR:
2640                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2641                 break;
2642         case NVME_CR_AQA:
2643                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2644                 break;
2645         case NVME_CR_ASQ_LOW:
2646                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2647                 break;
2648         case NVME_CR_ASQ_HI:
2649                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2650                 break;
2651         case NVME_CR_ACQ_LOW:
2652                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2653                 break;
2654         case NVME_CR_ACQ_HI:
2655                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2656                 break;
2657         default:
2658                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2659         }
2660
2661 }
2662
2663 static void
2664 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2665         uint64_t offset, int size, uint64_t value)
2666 {
2667         uint32_t ccreg;
2668
2669         if (offset >= NVME_DOORBELL_OFFSET) {
2670                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2671                 uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2672                 int is_sq = (belloffset % 8) < 4;
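                     /*
                      * With a doorbell stride of 0, SQ y's Tail doorbell sits at
                      * NVME_DOORBELL_OFFSET + (8 * y) and CQ y's Head doorbell
                      * at NVME_DOORBELL_OFFSET + (8 * y) + 4, hence the divide
                      * and modulo above.
                      */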
2673
2674                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2675                         WPRINTF("guest attempted an overflow write offset "
2676                                  "0x%lx, val 0x%lx in %s",
2677                                  offset, value, __func__);
2678                         return;
2679                 }
2680
2681                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2682                 return;
2683         }
2684
2685         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2686                 offset, size, value);
2687
2688         if (size != 4) {
2689                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2690                          "val 0x%lx) to bar0 in %s",
2691                          size, offset, value, __func__);
2692                 /* TODO: shutdown device */
2693                 return;
2694         }
2695
2696         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2697
2698         pthread_mutex_lock(&sc->mtx);
2699
2700         switch (offset) {
2701         case NVME_CR_CAP_LOW:
2702         case NVME_CR_CAP_HI:
2703                 /* readonly */
2704                 break;
2705         case NVME_CR_VS:
2706                 /* readonly */
2707                 break;
2708         case NVME_CR_INTMS:
2709                 /* MSI-X, so ignore */
2710                 break;
2711         case NVME_CR_INTMC:
2712                 /* MSI-X, so ignore */
2713                 break;
2714         case NVME_CR_CC:
2715                 ccreg = (uint32_t)value;
2716
2717                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2718                          "iocqes %u",
2719                         __func__,
2720                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2721                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2722                          NVME_CC_GET_IOCQES(ccreg));
2723
2724                 if (NVME_CC_GET_SHN(ccreg)) {
2725                         /* perform shutdown - flush out data to backend */
2726                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2727                             NVME_CSTS_REG_SHST_SHIFT);
2728                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2729                             NVME_CSTS_REG_SHST_SHIFT;
2730                 }
2731                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2732                         if (NVME_CC_GET_EN(ccreg) == 0)
2733                                 /* transition 1->0 causes controller reset */
2734                                 pci_nvme_reset_locked(sc);
2735                         else
2736                                 pci_nvme_init_controller(ctx, sc);
2737                 }
2738
2739                 /* Insert the iocqes, iosqes and en bits from the write */
2740                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2741                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2742                 if (NVME_CC_GET_EN(ccreg) == 0) {
2743                         /* Insert the ams, mps and css bit fields */
2744                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2745                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2746                         sc->regs.csts &= ~NVME_CSTS_RDY;
2747                 } else if (sc->pending_ios == 0) {
2748                         sc->regs.csts |= NVME_CSTS_RDY;
2749                 }
2750                 break;
2751         case NVME_CR_CSTS:
2752                 break;
2753         case NVME_CR_NSSR:
2754                 /* ignore writes; don't support subsystem reset */
2755                 break;
2756         case NVME_CR_AQA:
2757                 sc->regs.aqa = (uint32_t)value;
2758                 break;
2759         case NVME_CR_ASQ_LOW:
2760                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2761                                (0xFFFFF000 & value);
2762                 break;
2763         case NVME_CR_ASQ_HI:
2764                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2765                                (value << 32);
2766                 break;
2767         case NVME_CR_ACQ_LOW:
2768                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2769                                (0xFFFFF000 & value);
2770                 break;
2771         case NVME_CR_ACQ_HI:
2772                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2773                                (value << 32);
2774                 break;
2775         default:
2776                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2777                          __func__, offset, value, size);
2778         }
2779         pthread_mutex_unlock(&sc->mtx);
2780 }
2781
2782 static void
2783 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2784                 int baridx, uint64_t offset, int size, uint64_t value)
2785 {
2786         struct pci_nvme_softc* sc = pi->pi_arg;
2787
2788         if (baridx == pci_msix_table_bar(pi) ||
2789             baridx == pci_msix_pba_bar(pi)) {
2790                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2791                          " value 0x%lx", baridx, offset, size, value);
2792
2793                 pci_emul_msix_twrite(pi, offset, size, value);
2794                 return;
2795         }
2796
2797         switch (baridx) {
2798         case 0:
2799                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2800                 break;
2801
2802         default:
2803                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2804                          __func__, baridx, value);
2805         }
2806 }
2807
2808 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2809         uint64_t offset, int size)
2810 {
2811         uint64_t value;
2812
2813         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2814
2815         if (offset < NVME_DOORBELL_OFFSET) {
2816                 void *p = &(sc->regs);
2817                 pthread_mutex_lock(&sc->mtx);
2818                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2819                 pthread_mutex_unlock(&sc->mtx);
2820         } else {
2821                 value = 0;
2822                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2823         }
2824
2825         switch (size) {
2826         case 1:
2827                 value &= 0xFF;
2828                 break;
2829         case 2:
2830                 value &= 0xFFFF;
2831                 break;
2832         case 4:
2833                 value &= 0xFFFFFFFF;
2834                 break;
2835         }
2836
2837         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2838                  offset, size, (uint32_t)value);
2839
2840         return (value);
2841 }
2842
2843
2844
2845 static uint64_t
2846 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2847     uint64_t offset, int size)
2848 {
2849         struct pci_nvme_softc* sc = pi->pi_arg;
2850
2851         if (baridx == pci_msix_table_bar(pi) ||
2852             baridx == pci_msix_pba_bar(pi)) {
2853                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2854                         baridx, offset, size);
2855
2856                 return pci_emul_msix_tread(pi, offset, size);
2857         }
2858
2859         switch (baridx) {
2860         case 0:
2861                 return pci_nvme_read_bar_0(sc, offset, size);
2862
2863         default:
2864                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2865         }
2866
2867         return (0);
2868 }
2869
2870 static int
2871 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2872 {
2873         char bident[sizeof("XX:X:X")];
2874         const char *value;
2875         uint32_t sectsz;
2876
2877         sc->max_queues = NVME_QUEUES;
2878         sc->max_qentries = NVME_MAX_QENTRIES;
2879         sc->ioslots = NVME_IOSLOTS;
2880         sc->num_squeues = sc->max_queues;
2881         sc->num_cqueues = sc->max_queues;
2882         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2883         sectsz = 0;
2884         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2885                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2886
2887         value = get_config_value_node(nvl, "maxq");
2888         if (value != NULL)
2889                 sc->max_queues = atoi(value);
2890         value = get_config_value_node(nvl, "qsz");
2891         if (value != NULL) {
2892                 sc->max_qentries = atoi(value);
2893                 if (sc->max_qentries <= 0) {
2894                         EPRINTLN("nvme: Invalid qsz option %d",
2895                             sc->max_qentries);
2896                         return (-1);
2897                 }
2898         }
2899         value = get_config_value_node(nvl, "ioslots");
2900         if (value != NULL) {
2901                 sc->ioslots = atoi(value);
2902                 if (sc->ioslots <= 0) {
2903                         EPRINTLN("Invalid ioslots option %d", sc->ioslots);
2904                         return (-1);
2905                 }
2906         }
2907         value = get_config_value_node(nvl, "sectsz");
2908         if (value != NULL)
2909                 sectsz = atoi(value);
2910         value = get_config_value_node(nvl, "ser");
2911         if (value != NULL) {
2912                 /*
2913                  * This field indicates the Product Serial Number in
2914                  * 7-bit ASCII, unused bytes should be space characters.
2915                  * Ref: NVMe v1.3c.
2916                  */
2917                 cpywithpad((char *)sc->ctrldata.sn,
2918                     sizeof(sc->ctrldata.sn), value, ' ');
2919         }
2920         value = get_config_value_node(nvl, "eui64");
2921         if (value != NULL)
2922                 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2923         value = get_config_value_node(nvl, "dsm");
2924         if (value != NULL) {
2925                 if (strcmp(value, "auto") == 0)
2926                         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2927                 else if (strcmp(value, "enable") == 0)
2928                         sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2929                 else if (strcmp(value, "disable") == 0)
2930                         sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2931         }
2932
2933         value = get_config_value_node(nvl, "ram");
2934         if (value != NULL) {
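                     /*
                      * ram=<N> creates a volatile, zero-filled backing store
                      * of N MiB with a default 4 KiB sector size; e.g.
                      * ram=256 gives a 256 MiB namespace whose contents are
                      * lost when the VM exits.
                      */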
2935                 uint64_t sz = strtoull(value, NULL, 10);
2936
2937                 sc->nvstore.type = NVME_STOR_RAM;
2938                 sc->nvstore.size = sz * 1024 * 1024;
2939                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2940                 sc->nvstore.sectsz = 4096;
2941                 sc->nvstore.sectsz_bits = 12;
2942                 if (sc->nvstore.ctx == NULL) {
2943                         EPRINTLN("nvme: Unable to allocate RAM");
2944                         return (-1);
2945                 }
2946         } else {
2947                 snprintf(bident, sizeof(bident), "%d:%d",
2948                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2949                 sc->nvstore.ctx = blockif_open(nvl, bident);
2950                 if (sc->nvstore.ctx == NULL) {
2951                         EPRINTLN("nvme: Could not open backing file: %s",
2952                             strerror(errno));
2953                         return (-1);
2954                 }
2955                 sc->nvstore.type = NVME_STOR_BLOCKIF;
2956                 sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2957         }
2958
2959         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2960                 sc->nvstore.sectsz = sectsz;
2961         else if (sc->nvstore.type != NVME_STOR_RAM)
2962                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
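             /* Derive sectsz_bits = log2(sectsz): 512 -> 9, 4096 -> 12, 8192 -> 13 */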
2963         for (sc->nvstore.sectsz_bits = 9;
2964              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2965              sc->nvstore.sectsz_bits++);
2966
2967         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2968                 sc->max_queues = NVME_QUEUES;
2969
2970         return (0);
2971 }
2972
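     /*
      * Device model initialization: parse the configuration, allocate the
      * I/O request slots, set the PCI identity, size and allocate BAR0
      * (registers plus doorbells), add the MSI-X and PCI Express
      * capabilities, initialize queues, namespace, controller data, log
      * pages, features, and AER/AEN state, then reset the controller.
      */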
2973 static int
2974 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
2975 {
2976         struct pci_nvme_softc *sc;
2977         uint32_t pci_membar_sz;
2978         int     error;
2979
2980         error = 0;
2981
2982         sc = calloc(1, sizeof(struct pci_nvme_softc));
2983         pi->pi_arg = sc;
2984         sc->nsc_pi = pi;
2985
2986         error = pci_nvme_parse_config(sc, nvl);
2987         if (error < 0)
2988                 goto done;
2989         else
2990                 error = 0;
2991
2992         STAILQ_INIT(&sc->ioreqs_free);
2993         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2994         for (int i = 0; i < sc->ioslots; i++) {
2995                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2996         }
2997
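             /*
              * PCI identity: the emulation's vendor/device IDs and the NVM
              * Express storage class codes.
              */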
2998         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2999         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3000         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3001         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3002         pci_set_cfgdata8(pi, PCIR_PROGIF,
3003                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3004
3005         /*
3006          * Allocate size of NVMe registers + doorbell space for all queues.
3007          *
3008          * The specification requires a minimum memory I/O window size of 16K.
3009          * The Windows driver will refuse to start a device with a smaller
3010          * window.
3011          */
3012         pci_membar_sz = sizeof(struct nvme_registers) +
3013             2 * sizeof(uint32_t) * (sc->max_queues + 1);
3014         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
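             /*
              * Each queue pair contributes a submission and a completion
              * doorbell (2 * sizeof(uint32_t)), with one extra pair for the
              * admin queue; for typical queue counts this stays well below
              * the 16K floor, so NVME_MMIO_SPACE_MIN sets the BAR size.
              */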
3015
3016         DPRINTF("nvme membar size: %u", pci_membar_sz);
3017
3018         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3019         if (error) {
3020                 WPRINTF("%s pci alloc mem bar failed", __func__);
3021                 goto done;
3022         }
3023
3024         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3025         if (error) {
3026                 WPRINTF("%s pci add msixcap failed", __func__);
3027                 goto done;
3028         }
3029
3030         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3031         if (error) {
3032                 WPRINTF("%s pci add Express capability failed", __func__);
3033                 goto done;
3034         }
3035
3036         pthread_mutex_init(&sc->mtx, NULL);
3037         sem_init(&sc->iosemlock, 0, sc->ioslots);
3038
3039         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3040         /*
3041          * Controller data depends on Namespace data, so initialize Namespace
3042          * data first.
3043          */
3044         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3045         pci_nvme_init_ctrldata(sc);
3046         pci_nvme_init_logpages(sc);
3047         pci_nvme_init_features(sc);
3048
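             /* Set up Asynchronous Event Request tracking and AEN generation */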
3049         pci_nvme_aer_init(sc);
3050         pci_nvme_aen_init(sc);
3051
3052         pci_nvme_reset(sc);
3053
3054         pci_lintr_request(pi);
3055
3056 done:
3057         return (error);
3058 }
3059
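     /*
      * Convert a legacy option string to nvlist form: a string beginning
      * with "ram=" stores the size under the "ram" node and passes any
      * remaining comma-separated options to pci_parse_legacy_config();
      * anything else is treated as blockif options.
      */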
3060 static int
3061 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3062 {
3063         char *cp, *ram;
3064
3065         if (opts == NULL)
3066                 return (0);
3067
3068         if (strncmp(opts, "ram=", 4) == 0) {
3069                 cp = strchr(opts, ',');
3070                 if (cp == NULL) {
3071                         set_config_value_node(nvl, "ram", opts + 4);
3072                         return (0);
3073                 }
3074                 ram = strndup(opts + 4, cp - opts - 4);
3075                 set_config_value_node(nvl, "ram", ram);
3076                 free(ram);
3077                 return (pci_parse_legacy_config(nvl, cp + 1));
3078         } else
3079                 return (blockif_legacy_config(nvl, opts));
3080 }
3081
3082 struct pci_devemu pci_de_nvme = {
3083         .pe_emu =       "nvme",
3084         .pe_init =      pci_nvme_init,
3085         .pe_legacy_config = pci_nvme_legacy_config,
3086         .pe_barwrite =  pci_nvme_write,
3087         .pe_barread =   pci_nvme_read
3088 };
3089 PCI_EMUL_SET(pci_de_nvme);