bhyve nvme: Implement Log Page Offset
[FreeBSD/FreeBSD.git] usr.sbin/bhyve/pci_nvme.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable, or disable
51  *
52  */
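/*
 * Example invocations (illustrative only; the slot number, backing paths, and
 * serial number below are hypothetical):
 *  -s 4,nvme,/dev/zvol/tank/disk0,maxq=8,qsz=256,ioslots=16,ser=BHYVE0001
 *  -s 4,nvme,ram=1024,sectsz=4096
 */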
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80
81 #include <dev/nvme/nvme.h>
82
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88
89
90 static int nvme_debug = 0;
91 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
92 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93
94 /* defaults; can be overridden */
95 #define NVME_MSIX_BAR           4
96
97 #define NVME_IOSLOTS            8
98
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN     (1 << 14)
101
102 #define NVME_QUEUES             16
103 #define NVME_MAX_QENTRIES       2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define NVME_MPSMIN             0
106 /* MPSMIN converted to bytes */
107 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
108
109 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
110 #define NVME_MDTS               9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
113 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
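/*
 * With the defaults above (NVME_MDTS = 9 and a 4 KiB minimum page size), this
 * works out to NVME_MAX_IOVEC = 513 descriptors and NVME_MAX_DATA_SIZE = 2 MiB
 * per command.
 */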
114
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS          0xffff
117 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
118
119 /* helpers */
120
121 /* Convert a zero-based value into a one-based value */
122 #define ONE_BASED(zero)         ((zero) + 1)
123 /* Convert a one-based value into a zero-based value */
124 #define ZERO_BASED(one)         ((one)  - 1)
125
126 /* Encode number of SQ's and CQ's for Set/Get Features */
127 #define NVME_FEATURE_NUM_QUEUES(sc) \
128         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
129          (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
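/*
 * For example, with num_squeues = 4 and num_cqueues = 4 the encoded feature
 * value is 0x00030003, since both fields are reported as zero-based counts.
 */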
130
131 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
132
133 enum nvme_controller_register_offsets {
134         NVME_CR_CAP_LOW = 0x00,
135         NVME_CR_CAP_HI  = 0x04,
136         NVME_CR_VS      = 0x08,
137         NVME_CR_INTMS   = 0x0c,
138         NVME_CR_INTMC   = 0x10,
139         NVME_CR_CC      = 0x14,
140         NVME_CR_CSTS    = 0x1c,
141         NVME_CR_NSSR    = 0x20,
142         NVME_CR_AQA     = 0x24,
143         NVME_CR_ASQ_LOW = 0x28,
144         NVME_CR_ASQ_HI  = 0x2c,
145         NVME_CR_ACQ_LOW = 0x30,
146         NVME_CR_ACQ_HI  = 0x34,
147 };
148
149 enum nvme_cmd_cdw11 {
150         NVME_CMD_CDW11_PC  = 0x0001,
151         NVME_CMD_CDW11_IEN = 0x0002,
152         NVME_CMD_CDW11_IV  = 0xFFFF0000,
153 };
154
155 enum nvme_copy_dir {
156         NVME_COPY_TO_PRP,
157         NVME_COPY_FROM_PRP,
158 };
159
160 #define NVME_CQ_INTEN   0x01
161 #define NVME_CQ_INTCOAL 0x02
162
163 struct nvme_completion_queue {
164         struct nvme_completion *qbase;
165         pthread_mutex_t mtx;
166         uint32_t        size;
167         uint16_t        tail; /* nvme progress */
168         uint16_t        head; /* guest progress */
169         uint16_t        intr_vec;
170         uint32_t        intr_en;
171 };
172
173 struct nvme_submission_queue {
174         struct nvme_command *qbase;
175         pthread_mutex_t mtx;
176         uint32_t        size;
177         uint16_t        head; /* nvme progress */
178         uint16_t        tail; /* guest progress */
179         uint16_t        cqid; /* completion queue id */
180         int             qpriority;
181 };
182
183 enum nvme_storage_type {
184         NVME_STOR_BLOCKIF = 0,
185         NVME_STOR_RAM = 1,
186 };
187
188 struct pci_nvme_blockstore {
189         enum nvme_storage_type type;
190         void            *ctx;
191         uint64_t        size;
192         uint32_t        sectsz;
193         uint32_t        sectsz_bits;
194         uint64_t        eui64;
195         uint32_t        deallocate:1;
196 };
197
198 /*
199  * Calculate the number of additional page descriptors for guest IO requests
200  * based on the advertised Max Data Transfer (MDTS) and given the number of
201  * default iovec's in a struct blockif_req.
202  */
203 #define MDTS_PAD_SIZE \
204         ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205           NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206           0 )
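/*
 * Worked example: with NVME_MAX_IOVEC = 513 and assuming (hypothetically) a
 * BLOCKIF_IOV_MAX of 128, MDTS_PAD_SIZE would be 513 - 128 = 385 padding
 * entries in struct pci_nvme_ioreq below.
 */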
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 typedef enum {
255         PCI_NVME_AE_TYPE_ERROR = 0,
256         PCI_NVME_AE_TYPE_SMART,
257         PCI_NVME_AE_TYPE_NOTICE,
258         PCI_NVME_AE_TYPE_IO_CMD = 6,
259         PCI_NVME_AE_TYPE_VENDOR = 7,
260         PCI_NVME_AE_TYPE_MAX            /* Must be last */
261 } pci_nvme_async_type;
262
263 /* Asynchronous Event Requests */
264 struct pci_nvme_aer {
265         STAILQ_ENTRY(pci_nvme_aer) link;
266         uint16_t        cid;    /* Command ID of the submitted AER */
267 };
268
269 typedef enum {
270         PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
271         PCI_NVME_AE_INFO_FW_ACTIVATION,
272         PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
273         PCI_NVME_AE_INFO_ANA_CHANGE,
274         PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
275         PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
276         PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
277         PCI_NVME_AE_INFO_MAX,
278 } pci_nvme_async_info;
279
280 /* Asynchronous Event Notifications */
281 struct pci_nvme_aen {
282         pci_nvme_async_type atype;
283         uint32_t        event_data;
284         bool            posted;
285 };
286
287 struct pci_nvme_softc {
288         struct pci_devinst *nsc_pi;
289
290         pthread_mutex_t mtx;
291
292         struct nvme_registers regs;
293
294         struct nvme_namespace_data  nsdata;
295         struct nvme_controller_data ctrldata;
296         struct nvme_error_information_entry err_log;
297         struct nvme_health_information_page health_log;
298         struct nvme_firmware_page fw_log;
299         struct nvme_ns_list ns_log;
300
301         struct pci_nvme_blockstore nvstore;
302
303         uint16_t        max_qentries;   /* max entries per queue */
304         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
305         uint32_t        num_cqueues;
306         uint32_t        num_squeues;
307         bool            num_q_is_set; /* Has host set Number of Queues */
308
309         struct pci_nvme_ioreq *ioreqs;
310         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
311         uint32_t        pending_ios;
312         uint32_t        ioslots;
313         sem_t           iosemlock;
314
315         /*
316          * Memory mapped Submission and Completion queues
317          * Each array includes both Admin and IO queues
318          */
319         struct nvme_completion_queue *compl_queues;
320         struct nvme_submission_queue *submit_queues;
321
322         struct nvme_feature_obj feat[NVME_FID_MAX];
323
324         enum nvme_dsm_type dataset_management;
325
326         /* Accounting for SMART data */
327         __uint128_t     read_data_units;
328         __uint128_t     write_data_units;
329         __uint128_t     read_commands;
330         __uint128_t     write_commands;
331         uint32_t        read_dunits_remainder;
332         uint32_t        write_dunits_remainder;
333
334         STAILQ_HEAD(, pci_nvme_aer) aer_list;
335         pthread_mutex_t aer_mtx;
336         uint32_t        aer_count;
337         struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
338         pthread_t       aen_tid;
339         pthread_mutex_t aen_mtx;
340         pthread_cond_t  aen_cond;
341 };
342
343
344 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
345     struct nvme_completion_queue *cq,
346     uint32_t cdw0,
347     uint16_t cid,
348     uint16_t sqid,
349     uint16_t status);
350 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
351 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
352 static void pci_nvme_io_done(struct blockif_req *, int);
353
354 /* Controller Configuration utils */
355 #define NVME_CC_GET_EN(cc) \
356         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
357 #define NVME_CC_GET_CSS(cc) \
358         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
359 #define NVME_CC_GET_SHN(cc) \
360         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
361 #define NVME_CC_GET_IOSQES(cc) \
362         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
363 #define NVME_CC_GET_IOCQES(cc) \
364         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
365
366 #define NVME_CC_WRITE_MASK \
367         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
368          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
369          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
370
371 #define NVME_CC_NEN_WRITE_MASK \
372         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
373          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
374          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
375
376 /* Controller Status utils */
377 #define NVME_CSTS_GET_RDY(sts) \
378         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
379
380 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
381
382 /* Completion Queue status word utils */
383 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
384 #define NVME_STATUS_MASK \
385         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
386          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
387
388 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
389         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
390
391 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
392     struct nvme_feature_obj *,
393     struct nvme_command *,
394     struct nvme_completion *);
395 static void nvme_feature_num_queues(struct pci_nvme_softc *,
396     struct nvme_feature_obj *,
397     struct nvme_command *,
398     struct nvme_completion *);
399 static void nvme_feature_iv_config(struct pci_nvme_softc *,
400     struct nvme_feature_obj *,
401     struct nvme_command *,
402     struct nvme_completion *);
403
404 static void *aen_thr(void *arg);
405
406 static __inline void
407 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
408 {
409         size_t len;
410
411         len = strnlen(src, dst_size);
412         memset(dst, pad, dst_size);
413         memcpy(dst, src, len);
414 }
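/*
 * e.g. cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ') below
 * copies the model string and space-fills the remainder of the destination.
 */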
415
416 static __inline void
417 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
418 {
419
420         *status &= ~NVME_STATUS_MASK;
421         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
422                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
423 }
424
425 static __inline void
426 pci_nvme_status_genc(uint16_t *status, uint16_t code)
427 {
428
429         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
430 }
431
432 /*
433  * Initialize the requested number of IO Submission and Completion Queues.
434  * Admin queues are allocated implicitly.
435  */
436 static void
437 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
438 {
439         uint32_t i;
440
441         /*
442          * Allocate and initialize the Submission Queues
443          */
444         if (nsq > NVME_QUEUES) {
445                 WPRINTF("%s: clamping number of SQ from %u to %u",
446                                         __func__, nsq, NVME_QUEUES);
447                 nsq = NVME_QUEUES;
448         }
449
450         sc->num_squeues = nsq;
451
452         sc->submit_queues = calloc(sc->num_squeues + 1,
453                                 sizeof(struct nvme_submission_queue));
454         if (sc->submit_queues == NULL) {
455                 WPRINTF("%s: SQ allocation failed", __func__);
456                 sc->num_squeues = 0;
457         } else {
458                 struct nvme_submission_queue *sq = sc->submit_queues;
459
460                 for (i = 0; i < sc->num_squeues; i++)
461                         pthread_mutex_init(&sq[i].mtx, NULL);
462         }
463
464         /*
465          * Allocate and initialize the Completion Queues
466          */
467         if (ncq > NVME_QUEUES) {
468                 WPRINTF("%s: clamping number of CQ from %u to %u",
469                                         __func__, ncq, NVME_QUEUES);
470                 ncq = NVME_QUEUES;
471         }
472
473         sc->num_cqueues = ncq;
474
475         sc->compl_queues = calloc(sc->num_cqueues + 1,
476                                 sizeof(struct nvme_completion_queue));
477         if (sc->compl_queues == NULL) {
478                 WPRINTF("%s: CQ allocation failed", __func__);
479                 sc->num_cqueues = 0;
480         } else {
481                 struct nvme_completion_queue *cq = sc->compl_queues;
482
483                 for (i = 0; i < sc->num_cqueues; i++)
484                         pthread_mutex_init(&cq[i].mtx, NULL);
485         }
486 }
487
488 static void
489 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
490 {
491         struct nvme_controller_data *cd = &sc->ctrldata;
492
493         cd->vid = 0xFB5D;
494         cd->ssvid = 0x0000;
495
496         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
497         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
498
499         /* Num of submission commands that we can handle at a time (2^rab) */
500         cd->rab   = 4;
501
502         /* FreeBSD OUI */
503         cd->ieee[0] = 0x58;
504         cd->ieee[1] = 0x9c;
505         cd->ieee[2] = 0xfc;
506
507         cd->mic = 0;
508
509         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
510
511         cd->ver = NVME_REV(1,4);
512
513         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
514         cd->acl = 2;
515         cd->aerl = 4;
516
517         /* Advertise 1, Read-only firmware slot */
518         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
519             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
520         cd->lpa = 0;    /* TODO: support some simple things like SMART */
521         cd->elpe = 0;   /* max error log page entries */
522         cd->npss = 1;   /* number of power states supported */
523
524         /* Warning Composite Temperature Threshold */
525         cd->wctemp = 0x0157;
526
527         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
528             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
529         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
530             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
531         cd->nn = 1;     /* number of namespaces */
532
533         cd->oncs = 0;
534         switch (sc->dataset_management) {
535         case NVME_DATASET_MANAGEMENT_AUTO:
536                 if (sc->nvstore.deallocate)
537                         cd->oncs |= NVME_ONCS_DSM;
538                 break;
539         case NVME_DATASET_MANAGEMENT_ENABLE:
540                 cd->oncs |= NVME_ONCS_DSM;
541                 break;
542         default:
543                 break;
544         }
545
546         cd->fna = 0x03;
547
548         cd->power_state[0].mp = 10;
549 }
550
551 /*
552  * Calculate the CRC-16 of the given buffer
553  * See copyright attribution at top of file
554  */
555 static uint16_t
556 crc16(uint16_t crc, const void *buffer, unsigned int len)
557 {
558         const unsigned char *cp = buffer;
559         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
560         static uint16_t const crc16_table[256] = {
561                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
562                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
563                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
564                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
565                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
566                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
567                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
568                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
569                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
570                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
571                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
572                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
573                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
574                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
575                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
576                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
577                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
578                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
579                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
580                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
581                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
582                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
583                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
584                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
585                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
586                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
587                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
588                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
589                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
590                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
591                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
592                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
593         };
594
595         while (len--)
596                 crc = (((crc >> 8) & 0xffU) ^
597                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
598         return crc;
599 }
600
601 static void
602 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
603     struct nvme_namespace_data *nd)
604 {
605
606         /* Get capacity and block size information from backing store */
607         nd->nsze = nvstore->size / nvstore->sectsz;
608         nd->ncap = nd->nsze;
609         nd->nuse = nd->nsze;
610 }
611
612 static void
613 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
614     struct nvme_namespace_data *nd, uint32_t nsid,
615     struct pci_nvme_blockstore *nvstore)
616 {
617
618         pci_nvme_init_nsdata_size(nvstore, nd);
619
620         if (nvstore->type == NVME_STOR_BLOCKIF)
621                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
622
623         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
624         nd->flbas = 0;
625
626         /* Create an EUI-64 if user did not provide one */
627         if (nvstore->eui64 == 0) {
628                 char *data = NULL;
629                 uint64_t eui64 = nvstore->eui64;
630
631                 asprintf(&data, "%s%u%u%u", get_config_value("name"),
632                     sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
633                     sc->nsc_pi->pi_func);
634
635                 if (data != NULL) {
636                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
637                         free(data);
638                 }
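                /*
                 * Pack the OUI/CRC-derived value into the upper bits and the
                 * namespace ID into the low 16 bits of the EUI-64.
                 */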
639                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
640         }
641         be64enc(nd->eui64, nvstore->eui64);
642
643         /* LBA data-sz = 2^lbads */
644         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
645 }
646
647 static void
648 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
649 {
650
651         memset(&sc->err_log, 0, sizeof(sc->err_log));
652         memset(&sc->health_log, 0, sizeof(sc->health_log));
653         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
654         memset(&sc->ns_log, 0, sizeof(sc->ns_log));
655
656         /* Set read/write remainder to round up according to spec */
657         sc->read_dunits_remainder = 999;
658         sc->write_dunits_remainder = 999;
659
660         /* Set nominal Health values checked by implementations */
661         sc->health_log.temperature = 310;
662         sc->health_log.available_spare = 100;
663         sc->health_log.available_spare_threshold = 10;
664 }
665
666 static void
667 pci_nvme_init_features(struct pci_nvme_softc *sc)
668 {
669
670         sc->feat[0].set = nvme_feature_invalid_cb;
671         sc->feat[0].get = nvme_feature_invalid_cb;
672
673         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
674         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
675         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
676         sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
677             nvme_feature_iv_config;
678         /* Enable all AENs by default */
679         sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f;
680         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
681             nvme_feature_invalid_cb;
682         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
683             nvme_feature_invalid_cb;
684 }
685
686 static void
687 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
688 {
689
690         STAILQ_INIT(&sc->aer_list);
691         sc->aer_count = 0;
692 }
693
694 static void
695 pci_nvme_aer_init(struct pci_nvme_softc *sc)
696 {
697
698         pthread_mutex_init(&sc->aer_mtx, NULL);
699         pci_nvme_aer_reset(sc);
700 }
701
702 static void
703 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
704 {
705         struct pci_nvme_aer *aer = NULL;
706
707         pthread_mutex_lock(&sc->aer_mtx);
708         while (!STAILQ_EMPTY(&sc->aer_list)) {
709                 aer = STAILQ_FIRST(&sc->aer_list);
710                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
711                 free(aer);
712         }
713         pthread_mutex_unlock(&sc->aer_mtx);
714
715         pci_nvme_aer_reset(sc);
716 }
717
718 static bool
719 pci_nvme_aer_available(struct pci_nvme_softc *sc)
720 {
721
722         return (sc->aer_count != 0);
723 }
724
725 static bool
726 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
727 {
728         struct nvme_controller_data *cd = &sc->ctrldata;
729
730         /* AERL is a zero-based value while aer_count is one-based */
731         return (sc->aer_count == (cd->aerl + 1));
732 }
733
734 /*
735  * Add an Async Event Request
736  *
737  * Stores an AER to be returned later if the Controller needs to notify the
738  * host of an event.
739  * Note that while the NVMe spec doesn't require Controllers to return AER's
740  * in order, this implementation does preserve the order.
741  */
742 static int
743 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
744 {
745         struct pci_nvme_aer *aer = NULL;
746
747         if (pci_nvme_aer_limit_reached(sc))
748                 return (-1);
749
750         aer = calloc(1, sizeof(struct pci_nvme_aer));
751         if (aer == NULL)
752                 return (-1);
753
754         /* Save the Command ID for use in the completion message */
755         aer->cid = cid;
756
757         pthread_mutex_lock(&sc->aer_mtx);
758         sc->aer_count++;
759         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
760         pthread_mutex_unlock(&sc->aer_mtx);
761
762         return (0);
763 }
764
765 /*
766  * Get an Async Event Request structure
767  *
768  * Returns a pointer to an AER previously submitted by the host or NULL if
769  * no AER's exist. Caller is responsible for freeing the returned struct.
770  */
771 static struct pci_nvme_aer *
772 pci_nvme_aer_get(struct pci_nvme_softc *sc)
773 {
774         struct pci_nvme_aer *aer = NULL;
775
776         pthread_mutex_lock(&sc->aer_mtx);
777         aer = STAILQ_FIRST(&sc->aer_list);
778         if (aer != NULL) {
779                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
780                 sc->aer_count--;
781         }
782         pthread_mutex_unlock(&sc->aer_mtx);
783         
784         return (aer);
785 }
786
787 static void
788 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
789 {
790         uint32_t        atype;
791
792         memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
793
794         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
795                 sc->aen[atype].atype = atype;
796         }
797 }
798
799 static void
800 pci_nvme_aen_init(struct pci_nvme_softc *sc)
801 {
802         char nstr[80];
803
804         pci_nvme_aen_reset(sc);
805
806         pthread_mutex_init(&sc->aen_mtx, NULL);
807         pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
808         snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
809             sc->nsc_pi->pi_func);
810         pthread_set_name_np(sc->aen_tid, nstr);
811 }
812
813 static void
814 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
815 {
816
817         pci_nvme_aen_reset(sc);
818 }
819
820 /* Notify the AEN thread of pending work */
821 static void
822 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
823 {
824
825         pthread_cond_signal(&sc->aen_cond);
826 }
827
828 /*
829  * Post an Asynchronous Event Notification
830  */
831 static int32_t
832 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
833                 uint32_t event_data)
834 {
835         struct pci_nvme_aen *aen;
836
837         if (atype >= PCI_NVME_AE_TYPE_MAX) {
838                 return(EINVAL);
839         }
840
841         pthread_mutex_lock(&sc->aen_mtx);
842         aen = &sc->aen[atype];
843
844         /* Has the controller already posted an event of this type? */
845         if (aen->posted) {
846                 pthread_mutex_unlock(&sc->aen_mtx);
847                 return(EALREADY);
848         }
849
850         aen->event_data = event_data;
851         aen->posted = true;
852         pthread_mutex_unlock(&sc->aen_mtx);
853
854         pci_nvme_aen_notify(sc);
855
856         return(0);
857 }
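/*
 * Illustrative usage (a sketch, not an actual call site from this excerpt):
 * a namespace change could be reported with
 *   pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
 *       PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
 * which wakes aen_thr() to deliver the event once the host has posted an AER.
 */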
858
859 static void
860 pci_nvme_aen_process(struct pci_nvme_softc *sc)
861 {
862         struct pci_nvme_aer *aer;
863         struct pci_nvme_aen *aen;
864         pci_nvme_async_type atype;
865         uint32_t mask;
866         uint16_t status;
867         uint8_t lid;
868
869         assert(pthread_mutex_isowned_np(&sc->aen_mtx));
870         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
871                 aen = &sc->aen[atype];
872                 /* Previous iterations may have depleted the available AER's */
873                 if (!pci_nvme_aer_available(sc)) {
874                         DPRINTF("%s: no AER", __func__);
875                         break;
876                 }
877
878                 if (!aen->posted) {
879                         DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
880                         continue;
881                 }
882
883                 status = NVME_SC_SUCCESS;
884
885                 /* Is the event masked? */
886                 mask =
887                     sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
888
889                 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
890                 switch (atype) {
891                 case PCI_NVME_AE_TYPE_ERROR:
892                         lid = NVME_LOG_ERROR;
893                         break;
894                 case PCI_NVME_AE_TYPE_SMART:
895                         mask &= 0xff;
896                         if ((mask & aen->event_data) == 0)
897                                 continue;
898                         lid = NVME_LOG_HEALTH_INFORMATION;
899                         break;
900                 case PCI_NVME_AE_TYPE_NOTICE:
901                         if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
902                                 EPRINTLN("%s unknown AEN notice type %u",
903                                     __func__, aen->event_data);
904                                 status = NVME_SC_INTERNAL_DEVICE_ERROR;
905                                 break;
906                         }
907                         mask >>= 8;
908                         if (((1 << aen->event_data) & mask) == 0)
909                                 continue;
910                         switch (aen->event_data) {
911                         case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
912                                 lid = NVME_LOG_CHANGED_NAMESPACE;
913                                 break;
914                         case PCI_NVME_AE_INFO_FW_ACTIVATION:
915                                 lid = NVME_LOG_FIRMWARE_SLOT;
916                                 break;
917                         case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
918                                 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
919                                 break;
920                         case PCI_NVME_AE_INFO_ANA_CHANGE:
921                                 lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling
922                                 break;
923                         case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
924                                 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
925                                 break;
926                         case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
927                                 lid = NVME_LOG_LBA_STATUS_INFORMATION;
928                                 break;
929                         case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
930                                 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
931                                 break;
932                         default:
933                                 lid = 0;
934                         }
935                         break;
936                 default:
937                         /* bad type?!? */
938                         EPRINTLN("%s unknown AEN type %u", __func__, atype);
939                         status = NVME_SC_INTERNAL_DEVICE_ERROR;
940                         break;
941                 }
942
943                 aer = pci_nvme_aer_get(sc);
944                 assert(aer != NULL);
945
946                 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
947                 pci_nvme_cq_update(sc, &sc->compl_queues[0],
948                     (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
949                     aer->cid,
950                     0,          /* SQID */
951                     status);
952
953                 aen->event_data = 0;
954                 aen->posted = false;
955
956                 pci_generate_msix(sc->nsc_pi, 0);
957         }
958 }
959
960 static void *
961 aen_thr(void *arg)
962 {
963         struct pci_nvme_softc *sc;
964
965         sc = arg;
966
967         pthread_mutex_lock(&sc->aen_mtx);
968         for (;;) {
969                 pci_nvme_aen_process(sc);
970                 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
971         }
972         pthread_mutex_unlock(&sc->aen_mtx);
973
974         pthread_exit(NULL);
975         return (NULL);
976 }
977
978 static void
979 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
980 {
981         uint32_t i;
982
983         DPRINTF("%s", __func__);
984
985         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
986             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
987             (60 << NVME_CAP_LO_REG_TO_SHIFT);
988
989         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
990
991         sc->regs.vs = NVME_REV(1,4);    /* NVMe v1.4 */
992
993         sc->regs.cc = 0;
994         sc->regs.csts = 0;
995
996         assert(sc->submit_queues != NULL);
997
998         for (i = 0; i < sc->num_squeues + 1; i++) {
999                 sc->submit_queues[i].qbase = NULL;
1000                 sc->submit_queues[i].size = 0;
1001                 sc->submit_queues[i].cqid = 0;
1002                 sc->submit_queues[i].tail = 0;
1003                 sc->submit_queues[i].head = 0;
1004         }
1005
1006         assert(sc->compl_queues != NULL);
1007
1008         for (i = 0; i < sc->num_cqueues + 1; i++) {
1009                 sc->compl_queues[i].qbase = NULL;
1010                 sc->compl_queues[i].size = 0;
1011                 sc->compl_queues[i].tail = 0;
1012                 sc->compl_queues[i].head = 0;
1013         }
1014
1015         sc->num_q_is_set = false;
1016
1017         pci_nvme_aer_destroy(sc);
1018         pci_nvme_aen_destroy(sc);
1019 }
1020
1021 static void
1022 pci_nvme_reset(struct pci_nvme_softc *sc)
1023 {
1024         pthread_mutex_lock(&sc->mtx);
1025         pci_nvme_reset_locked(sc);
1026         pthread_mutex_unlock(&sc->mtx);
1027 }
1028
1029 static void
1030 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1031 {
1032         uint16_t acqs, asqs;
1033
1034         DPRINTF("%s", __func__);
1035
1036         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1037         sc->submit_queues[0].size = asqs;
1038         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1039                     sizeof(struct nvme_command) * asqs);
1040
1041         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1042                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1043
1044         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
1045             NVME_AQA_REG_ACQS_MASK) + 1;
1046         sc->compl_queues[0].size = acqs;
1047         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1048                  sizeof(struct nvme_completion) * acqs);
1049         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1050
1051         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1052                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1053 }
1054
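/*
 * nvme_prp_memcpy() copies between a host buffer and guest memory described
 * by at most two PRP entries; PRP2 is treated as a plain page pointer rather
 * than a PRP list, which is why transfers are capped at 8 KiB below.
 */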
1055 static int
1056 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1057         size_t len, enum nvme_copy_dir dir)
1058 {
1059         uint8_t *p;
1060         size_t bytes;
1061
1062         if (len > (8 * 1024)) {
1063                 return (-1);
1064         }
1065
1066         /* Copy from the start of prp1 to the end of the physical page */
1067         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1068         bytes = MIN(bytes, len);
1069
1070         p = vm_map_gpa(ctx, prp1, bytes);
1071         if (p == NULL) {
1072                 return (-1);
1073         }
1074
1075         if (dir == NVME_COPY_TO_PRP)
1076                 memcpy(p, b, bytes);
1077         else
1078                 memcpy(b, p, bytes);
1079
1080         b += bytes;
1081
1082         len -= bytes;
1083         if (len == 0) {
1084                 return (0);
1085         }
1086
1087         len = MIN(len, PAGE_SIZE);
1088
1089         p = vm_map_gpa(ctx, prp2, len);
1090         if (p == NULL) {
1091                 return (-1);
1092         }
1093
1094         if (dir == NVME_COPY_TO_PRP)
1095                 memcpy(p, b, len);
1096         else
1097                 memcpy(b, p, len);
1098
1099         return (0);
1100 }
1101
1102 /*
1103  * Write a Completion Queue Entry
1104  *
1105  * Write the completion entry and advance the queue's internal tail pointer
1106  */
1107 static void
1108 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1109                 struct nvme_completion_queue *cq,
1110                 uint32_t cdw0,
1111                 uint16_t cid,
1112                 uint16_t sqid,
1113                 uint16_t status)
1114 {
1115         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1116         struct nvme_completion *cqe;
1117
1118         assert(cq->qbase != NULL);
1119
1120         pthread_mutex_lock(&cq->mtx);
1121
1122         cqe = &cq->qbase[cq->tail];
1123
1124         /* Flip the phase bit so the host can detect the newly posted entry */
1125         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1126
1127         cqe->cdw0 = cdw0;
1128         cqe->sqhd = sq->head;
1129         cqe->sqid = sqid;
1130         cqe->cid = cid;
1131         cqe->status = status;
1132
1133         cq->tail++;
1134         if (cq->tail >= cq->size) {
1135                 cq->tail = 0;
1136         }
1137
1138         pthread_mutex_unlock(&cq->mtx);
1139 }
1140
1141 static int
1142 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1143         struct nvme_completion* compl)
1144 {
1145         uint16_t qid = command->cdw10 & 0xffff;
1146
1147         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1148         if (qid == 0 || qid > sc->num_squeues ||
1149             (sc->submit_queues[qid].qbase == NULL)) {
1150                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1151                         __func__, qid, sc->num_squeues);
1152                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1153                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1154                 return (1);
1155         }
1156
1157         sc->submit_queues[qid].qbase = NULL;
1158         sc->submit_queues[qid].cqid = 0;
1159         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1160         return (1);
1161 }
1162
1163 static int
1164 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1165         struct nvme_completion* compl)
1166 {
1167         if (command->cdw11 & NVME_CMD_CDW11_PC) {
1168                 uint16_t qid = command->cdw10 & 0xffff;
1169                 struct nvme_submission_queue *nsq;
1170
1171                 if ((qid == 0) || (qid > sc->num_squeues) ||
1172                     (sc->submit_queues[qid].qbase != NULL)) {
1173                         WPRINTF("%s queue index %u > num_squeues %u",
1174                                 __func__, qid, sc->num_squeues);
1175                         pci_nvme_status_tc(&compl->status,
1176                             NVME_SCT_COMMAND_SPECIFIC,
1177                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1178                         return (1);
1179                 }
1180
1181                 nsq = &sc->submit_queues[qid];
1182                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1183                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1184                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1185                         /*
1186                          * Queues must specify at least two entries
1187                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1188                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1189                          */
1190                         pci_nvme_status_tc(&compl->status,
1191                             NVME_SCT_COMMAND_SPECIFIC,
1192                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1193                         return (1);
1194                 }
1195                 nsq->head = nsq->tail = 0;
1196
1197                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1198                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1199                         pci_nvme_status_tc(&compl->status,
1200                             NVME_SCT_COMMAND_SPECIFIC,
1201                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1202                         return (1);
1203                 }
1204
1205                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1206                         pci_nvme_status_tc(&compl->status,
1207                             NVME_SCT_COMMAND_SPECIFIC,
1208                             NVME_SC_COMPLETION_QUEUE_INVALID);
1209                         return (1);
1210                 }
1211
1212                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1213
1214                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1215                               sizeof(struct nvme_command) * (size_t)nsq->size);
1216
1217                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1218                         qid, nsq->size, nsq->qbase, nsq->cqid);
1219
1220                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1221
1222                 DPRINTF("%s completed creating IOSQ qid %u",
1223                          __func__, qid);
1224         } else {
1225                 /* 
1226                  * Guest sent a non-contiguous submission queue request.
1227                  * This setting is unsupported by this emulation.
1228                  */
1229                 WPRINTF("%s unsupported non-contig (list-based) "
1230                          "create i/o submission queue", __func__);
1231
1232                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1233         }
1234         return (1);
1235 }
1236
1237 static int
1238 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1239         struct nvme_completion* compl)
1240 {
1241         uint16_t qid = command->cdw10 & 0xffff;
1242         uint16_t sqid;
1243
1244         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1245         if (qid == 0 || qid > sc->num_cqueues ||
1246             (sc->compl_queues[qid].qbase == NULL)) {
1247                 WPRINTF("%s queue index %u / num_cqueues %u",
1248                         __func__, qid, sc->num_cqueues);
1249                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1250                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1251                 return (1);
1252         }
1253
1254         /* Deleting an Active CQ is an error */
1255         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1256                 if (sc->submit_queues[sqid].cqid == qid) {
1257                         pci_nvme_status_tc(&compl->status,
1258                             NVME_SCT_COMMAND_SPECIFIC,
1259                             NVME_SC_INVALID_QUEUE_DELETION);
1260                         return (1);
1261                 }
1262
1263         sc->compl_queues[qid].qbase = NULL;
1264         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1265         return (1);
1266 }
1267
1268 static int
1269 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1270         struct nvme_completion* compl)
1271 {
1272         struct nvme_completion_queue *ncq;
1273         uint16_t qid = command->cdw10 & 0xffff;
1274
1275         /* Only support Physically Contiguous queues */
1276         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1277                 WPRINTF("%s unsupported non-contig (list-based) "
1278                          "create i/o completion queue",
1279                          __func__);
1280
1281                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1282                 return (1);
1283         }
1284
1285         if ((qid == 0) || (qid > sc->num_cqueues) ||
1286             (sc->compl_queues[qid].qbase != NULL)) {
1287                 WPRINTF("%s queue index %u > num_cqueues %u",
1288                         __func__, qid, sc->num_cqueues);
1289                 pci_nvme_status_tc(&compl->status,
1290                     NVME_SCT_COMMAND_SPECIFIC,
1291                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1292                 return (1);
1293         }
1294
1295         ncq = &sc->compl_queues[qid];
1296         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1297         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1298         if (ncq->intr_vec > (sc->max_queues + 1)) {
1299                 pci_nvme_status_tc(&compl->status,
1300                     NVME_SCT_COMMAND_SPECIFIC,
1301                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1302                 return (1);
1303         }
1304
1305         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1306         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1307                 /*
1308                  * Queues must specify at least two entries
1309                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1310                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1311                  */
1312                 pci_nvme_status_tc(&compl->status,
1313                     NVME_SCT_COMMAND_SPECIFIC,
1314                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1315                 return (1);
1316         }
1317         ncq->head = ncq->tail = 0;
1318         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1319                      command->prp1,
1320                      sizeof(struct nvme_completion) * (size_t)ncq->size);
1321
1322         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1323
1324
1325         return (1);
1326 }
1327
1328 static int
1329 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1330         struct nvme_completion* compl)
1331 {
1332         uint64_t logoff;
1333         uint32_t logsize;
1334         uint8_t logpage = command->cdw10 & 0xFF;
1335
1336         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1337
1338         /*
1339          * Command specifies the number of dwords to return in fields NUMDU
1340          * and NUMDL. This is a zero-based value.
1341          */
1342         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1343         logsize *= sizeof(uint32_t);
1344         logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1345
1346         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
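        /*
         * For example, NUMDL = 0x7f with NUMDU = 0 requests 128 dwords, i.e.
         * a logsize of 512 bytes.
         */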
1347
1348         switch (logpage) {
1349         case NVME_LOG_ERROR:
1350                 if (logoff >= sizeof(sc->err_log)) {
1351                         pci_nvme_status_genc(&compl->status,
1352                             NVME_SC_INVALID_FIELD);
1353                         break;
1354                 }
1355
1356                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1357                     command->prp2, (uint8_t *)&sc->err_log + logoff,
1358                     MIN(logsize, sizeof(sc->err_log) - logoff),
1359                     NVME_COPY_TO_PRP);
1360                 break;
1361         case NVME_LOG_HEALTH_INFORMATION:
1362                 if (logoff >= sizeof(sc->health_log)) {
1363                         pci_nvme_status_genc(&compl->status,
1364                             NVME_SC_INVALID_FIELD);
1365                         break;
1366                 }
1367
1368                 pthread_mutex_lock(&sc->mtx);
1369                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1370                     sizeof(sc->health_log.data_units_read));
1371                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1372                     sizeof(sc->health_log.data_units_written));
1373                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1374                     sizeof(sc->health_log.host_read_commands));
1375                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1376                     sizeof(sc->health_log.host_write_commands));
1377                 pthread_mutex_unlock(&sc->mtx);
1378
1379                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1380                     command->prp2, (uint8_t *)&sc->health_log + logoff,
1381                     MIN(logsize, sizeof(sc->health_log) - logoff),
1382                     NVME_COPY_TO_PRP);
1383                 break;
1384         case NVME_LOG_FIRMWARE_SLOT:
1385                 if (logoff >= sizeof(sc->fw_log)) {
1386                         pci_nvme_status_genc(&compl->status,
1387                             NVME_SC_INVALID_FIELD);
1388                         break;
1389                 }
1390
1391                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1392                     command->prp2, (uint8_t *)&sc->fw_log + logoff,
1393                     MIN(logsize, sizeof(sc->fw_log) - logoff),
1394                     NVME_COPY_TO_PRP);
1395                 break;
1396         case NVME_LOG_CHANGED_NAMESPACE:
1397                 if (logoff >= sizeof(sc->ns_log)) {
1398                         pci_nvme_status_genc(&compl->status,
1399                             NVME_SC_INVALID_FIELD);
1400                         break;
1401                 }
1402
1403                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1404                     command->prp2, (uint8_t *)&sc->ns_log + logoff,
1405                     MIN(logsize, sizeof(sc->ns_log) - logoff),
1406                     NVME_COPY_TO_PRP);
1407                 memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1408                 break;
1409         default:
1410                 DPRINTF("%s get log page %x command not supported",
1411                         __func__, logpage);
1412
1413                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1414                     NVME_SC_INVALID_LOG_PAGE);
1415         }
1416
1417         return (1);
1418 }
1419
1420 static int
1421 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1422         struct nvme_completion* compl)
1423 {
1424         void *dest;
1425         uint16_t status;
1426
1427         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1428                 command->cdw10 & 0xFF, command->nsid);
1429
1430         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1431
1432         switch (command->cdw10 & 0xFF) {
1433         case 0x00: /* return Identify Namespace data structure */
1434                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1435                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1436                     NVME_COPY_TO_PRP);
1437                 break;
1438         case 0x01: /* return Identify Controller data structure */
1439                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1440                     command->prp2, (uint8_t *)&sc->ctrldata,
1441                     sizeof(sc->ctrldata),
1442                     NVME_COPY_TO_PRP);
1443                 break;
1444         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1445                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1446                                   sizeof(uint32_t) * 1024);
1447                 /* All unused entries shall be zero */
1448                 bzero(dest, sizeof(uint32_t) * 1024);
1449                 ((uint32_t *)dest)[0] = 1;
1450                 break;
1451         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1452                 if (command->nsid != 1) {
1453                         pci_nvme_status_genc(&status,
1454                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1455                         break;
1456                 }
1457                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1458                                   sizeof(uint32_t) * 1024);
1459                 /* All bytes after the descriptor shall be zero */
1460                 bzero(dest, sizeof(uint32_t) * 1024);
1461
1462                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1463                 ((uint8_t *)dest)[0] = 1;
1464                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1465                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1466                 break;
1467         default:
1468                 DPRINTF("%s unsupported identify command requested 0x%x",
1469                          __func__, command->cdw10 & 0xFF);
1470                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1471                 break;
1472         }
1473
1474         compl->status = status;
1475         return (1);
1476 }
1477
1478 static const char *
1479 nvme_fid_to_name(uint8_t fid)
1480 {
1481         const char *name;
1482
1483         switch (fid) {
1484         case NVME_FEAT_ARBITRATION:
1485                 name = "Arbitration";
1486                 break;
1487         case NVME_FEAT_POWER_MANAGEMENT:
1488                 name = "Power Management";
1489                 break;
1490         case NVME_FEAT_LBA_RANGE_TYPE:
1491                 name = "LBA Range Type";
1492                 break;
1493         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1494                 name = "Temperature Threshold";
1495                 break;
1496         case NVME_FEAT_ERROR_RECOVERY:
1497                 name = "Error Recovery";
1498                 break;
1499         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1500                 name = "Volatile Write Cache";
1501                 break;
1502         case NVME_FEAT_NUMBER_OF_QUEUES:
1503                 name = "Number of Queues";
1504                 break;
1505         case NVME_FEAT_INTERRUPT_COALESCING:
1506                 name = "Interrupt Coalescing";
1507                 break;
1508         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1509                 name = "Interrupt Vector Configuration";
1510                 break;
1511         case NVME_FEAT_WRITE_ATOMICITY:
1512                 name = "Write Atomicity Normal";
1513                 break;
1514         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1515                 name = "Asynchronous Event Configuration";
1516                 break;
1517         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1518                 name = "Autonomous Power State Transition";
1519                 break;
1520         case NVME_FEAT_HOST_MEMORY_BUFFER:
1521                 name = "Host Memory Buffer";
1522                 break;
1523         case NVME_FEAT_TIMESTAMP:
1524                 name = "Timestamp";
1525                 break;
1526         case NVME_FEAT_KEEP_ALIVE_TIMER:
1527                 name = "Keep Alive Timer";
1528                 break;
1529         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1530                 name = "Host Controlled Thermal Management";
1531                 break;
1532         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1533                 name = "Non-Operational Power State Config";
1534                 break;
1535         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1536                 name = "Read Recovery Level Config";
1537                 break;
1538         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1539                 name = "Predictable Latency Mode Config";
1540                 break;
1541         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1542                 name = "Predictable Latency Mode Window";
1543                 break;
1544         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1545                 name = "LBA Status Information Report Interval";
1546                 break;
1547         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1548                 name = "Host Behavior Support";
1549                 break;
1550         case NVME_FEAT_SANITIZE_CONFIG:
1551                 name = "Sanitize Config";
1552                 break;
1553         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1554                 name = "Endurance Group Event Configuration";
1555                 break;
1556         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1557                 name = "Software Progress Marker";
1558                 break;
1559         case NVME_FEAT_HOST_IDENTIFIER:
1560                 name = "Host Identifier";
1561                 break;
1562         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1563                 name = "Reservation Notification Mask";
1564                 break;
1565         case NVME_FEAT_RESERVATION_PERSISTENCE:
1566                 name = "Reservation Persistence";
1567                 break;
1568         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1569                 name = "Namespace Write Protection Config";
1570                 break;
1571         default:
1572                 name = "Unknown";
1573                 break;
1574         }
1575
1576         return (name);
1577 }
1578
1579 static void
1580 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1581     struct nvme_feature_obj *feat,
1582     struct nvme_command *command,
1583     struct nvme_completion *compl)
1584 {
1585
1586         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1587 }
1588
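/*
 * Set Features handler for Interrupt Vector Configuration. CDW11 is
 * decoded below as bits 15:0 = Interrupt Vector (IV) and bit 16 =
 * Coalescing Disable (CD); the request only succeeds if the vector is
 * in range, is actually in use by a completion queue, and, for the
 * Admin vector (IV 0), only if Coalescing Disable is set.
 */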
1589 static void
1590 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1591     struct nvme_feature_obj *feat,
1592     struct nvme_command *command,
1593     struct nvme_completion *compl)
1594 {
1595         uint32_t i;
1596         uint32_t cdw11 = command->cdw11;
1597         uint16_t iv;
1598         bool cd;
1599
1600         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1601
1602         iv = cdw11 & 0xffff;
1603         cd = cdw11 & (1 << 16);
1604
1605         if (iv > (sc->max_queues + 1)) {
1606                 return;
1607         }
1608
1609         /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1610         if ((iv == 0) && !cd)
1611                 return;
1612
1613         /* Requested Interrupt Vector must be used by a CQ */
1614         for (i = 0; i < sc->num_cqueues + 1; i++) {
1615                 if (sc->compl_queues[i].intr_vec == iv) {
1616                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1617                 }
1618         }
1619
1620 }
1621
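/*
 * Set Features handler for Number of Queues. NSQR (CDW11 15:0) and
 * NCQR (CDW11 31:16) are zero-based counts, hence ONE_BASED(); both are
 * clamped to max_queues, and this emulation only allows the feature to
 * be set once after a reset (a second attempt fails with Command
 * Sequence Error).
 */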
1622 static void
1623 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1624     struct nvme_feature_obj *feat,
1625     struct nvme_command *command,
1626     struct nvme_completion *compl)
1627 {
1628         uint16_t nqr;   /* Number of Queues Requested */
1629
1630         if (sc->num_q_is_set) {
1631                 WPRINTF("%s: Number of Queues already set", __func__);
1632                 pci_nvme_status_genc(&compl->status,
1633                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1634                 return;
1635         }
1636
1637         nqr = command->cdw11 & 0xFFFF;
1638         if (nqr == 0xffff) {
1639                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1640                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1641                 return;
1642         }
1643
1644         sc->num_squeues = ONE_BASED(nqr);
1645         if (sc->num_squeues > sc->max_queues) {
1646                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1647                                         sc->max_queues);
1648                 sc->num_squeues = sc->max_queues;
1649         }
1650
1651         nqr = (command->cdw11 >> 16) & 0xFFFF;
1652         if (nqr == 0xffff) {
1653                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1654                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1655                 return;
1656         }
1657
1658         sc->num_cqueues = ONE_BASED(nqr);
1659         if (sc->num_cqueues > sc->max_queues) {
1660                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1661                                         sc->max_queues);
1662                 sc->num_cqueues = sc->max_queues;
1663         }
1664
1665         /* Patch the command value which will be saved on callback's return */
1666         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1667         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1668
1669         sc->num_q_is_set = true;
1670 }
1671
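/*
 * Set Features: validate the Feature Identifier and the namespace scope,
 * dispatch to the per-feature set() callback if one is registered, and
 * on success cache CDW11 so a later Get Features returns the value the
 * host programmed.
 */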
1672 static int
1673 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1674         struct nvme_completion *compl)
1675 {
1676         struct nvme_feature_obj *feat;
1677         uint32_t nsid = command->nsid;
1678         uint8_t fid = command->cdw10 & 0xFF;
1679
1680         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1681
1682         if (fid >= NVME_FID_MAX) {
1683                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1684                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1685                 return (1);
1686         }
1687         feat = &sc->feat[fid];
1688
1689         if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1690                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1691                 return (1);
1692         }
1693
1694         if (!feat->namespace_specific &&
1695             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1696                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1697                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1698                 return (1);
1699         }
1700
1701         compl->cdw0 = 0;
1702         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1703
1704         if (feat->set)
1705                 feat->set(sc, feat, command, compl);
1706
1707         DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1708         if (compl->status == NVME_SC_SUCCESS) {
1709                 feat->cdw11 = command->cdw11;
1710                 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1711                     (command->cdw11 != 0))
1712                         pci_nvme_aen_notify(sc);
1713         }
1714
1715         return (0);
1716 }
1717
1718 static int
1719 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1720         struct nvme_completion* compl)
1721 {
1722         struct nvme_feature_obj *feat;
1723         uint8_t fid = command->cdw10 & 0xFF;
1724
1725         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1726
1727         if (fid >= NVME_FID_MAX) {
1728                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1729                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1730                 return (1);
1731         }
1732
1733         compl->cdw0 = 0;
1734         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1735
1736         feat = &sc->feat[fid];
1737         if (feat->get) {
1738                 feat->get(sc, feat, command, compl);
1739         }
1740
1741         if (compl->status == NVME_SC_SUCCESS) {
1742                 compl->cdw0 = feat->cdw11;
1743         }
1744
1745         return (0);
1746 }
1747
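/*
 * Format NVM: only Secure Erase Settings 0 and 1 (none / User Data
 * Erase), LBA Format 0, and no end-to-end Protection Information are
 * accepted. A RAM-backed namespace is simply reallocated; otherwise the
 * entire backing store is trimmed via blockif_delete() and the command
 * completes later from pci_nvme_io_done().
 */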
1748 static int
1749 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1750         struct nvme_completion* compl)
1751 {
1752         uint8_t ses, lbaf, pi;
1753
1754         /* Only supports Secure Erase Setting - User Data Erase */
1755         ses = (command->cdw10 >> 9) & 0x7;
1756         if (ses > 0x1) {
1757                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1758                 return (1);
1759         }
1760
1761         /* Only supports a single LBA Format */
1762         lbaf = command->cdw10 & 0xf;
1763         if (lbaf != 0) {
1764                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1765                     NVME_SC_INVALID_FORMAT);
1766                 return (1);
1767         }
1768
1769                 /* Doesn't support Protection Information */
1770         pi = (command->cdw10 >> 5) & 0x7;
1771         if (pi != 0) {
1772                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1773                 return (1);
1774         }
1775
1776         if (sc->nvstore.type == NVME_STOR_RAM) {
1777                 if (sc->nvstore.ctx)
1778                         free(sc->nvstore.ctx);
1779                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1780                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1781         } else {
1782                 struct pci_nvme_ioreq *req;
1783                 int err;
1784
1785                 req = pci_nvme_get_ioreq(sc);
1786                 if (req == NULL) {
1787                         pci_nvme_status_genc(&compl->status,
1788                             NVME_SC_INTERNAL_DEVICE_ERROR);
1789                         WPRINTF("%s: unable to allocate IO req", __func__);
1790                         return (1);
1791                 }
1792                 req->nvme_sq = &sc->submit_queues[0];
1793                 req->sqid = 0;
1794                 req->opc = command->opc;
1795                 req->cid = command->cid;
1796                 req->nsid = command->nsid;
1797
1798                 req->io_req.br_offset = 0;
1799                 req->io_req.br_resid = sc->nvstore.size;
1800                 req->io_req.br_callback = pci_nvme_io_done;
1801
1802                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1803                 if (err) {
1804                         pci_nvme_status_genc(&compl->status,
1805                             NVME_SC_INTERNAL_DEVICE_ERROR);
1806                         pci_nvme_release_ioreq(sc, req);
1807                 } else
1808                         compl->status = NVME_NO_STATUS;
1809         }
1810
1811         return (1);
1812 }
1813
1814 static int
1815 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1816         struct nvme_completion* compl)
1817 {
1818         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1819                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1820
1821         /* TODO: search for the command ID and abort it */
1822
1823         compl->cdw0 = 1;
1824         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1825         return (1);
1826 }
1827
1828 static int
1829 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1830         struct nvme_command* command, struct nvme_completion* compl)
1831 {
1832         DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1833             sc->aer_count, sc->ctrldata.aerl, command->cid);
1834
1835         /* Don't exceed the Async Event Request Limit (AERL). */
1836         if (pci_nvme_aer_limit_reached(sc)) {
1837                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1838                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1839                 return (1);
1840         }
1841
1842         if (pci_nvme_aer_add(sc, command->cid)) {
1843                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1844                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1845                 return (1);
1846         }
1847
1848         /*
1849          * Raise events when they happen based on the Set Features cmd.
1850          * These events happen asynchronously, so only post a successful
1851          * completion once an event matching the request actually occurs.
1852          */
1853         compl->status = NVME_NO_STATUS;
1854         pci_nvme_aen_notify(sc);
1855
1856         return (0);
1857 }
1858
1859 static void
1860 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1861 {
1862         struct nvme_completion compl;
1863         struct nvme_command *cmd;
1864         struct nvme_submission_queue *sq;
1865         struct nvme_completion_queue *cq;
1866         uint16_t sqhead;
1867
1868         DPRINTF("%s index %u", __func__, (uint32_t)value);
1869
1870         sq = &sc->submit_queues[0];
1871         cq = &sc->compl_queues[0];
1872
1873         pthread_mutex_lock(&sq->mtx);
1874
1875         sqhead = sq->head;
1876         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1877         
1878         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1879                 cmd = &(sq->qbase)[sqhead];
1880                 compl.cdw0 = 0;
1881                 compl.status = 0;
1882
1883                 switch (cmd->opc) {
1884                 case NVME_OPC_DELETE_IO_SQ:
1885                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1886                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1887                         break;
1888                 case NVME_OPC_CREATE_IO_SQ:
1889                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1890                         nvme_opc_create_io_sq(sc, cmd, &compl);
1891                         break;
1892                 case NVME_OPC_DELETE_IO_CQ:
1893                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1894                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1895                         break;
1896                 case NVME_OPC_CREATE_IO_CQ:
1897                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1898                         nvme_opc_create_io_cq(sc, cmd, &compl);
1899                         break;
1900                 case NVME_OPC_GET_LOG_PAGE:
1901                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1902                         nvme_opc_get_log_page(sc, cmd, &compl);
1903                         break;
1904                 case NVME_OPC_IDENTIFY:
1905                         DPRINTF("%s command IDENTIFY", __func__);
1906                         nvme_opc_identify(sc, cmd, &compl);
1907                         break;
1908                 case NVME_OPC_ABORT:
1909                         DPRINTF("%s command ABORT", __func__);
1910                         nvme_opc_abort(sc, cmd, &compl);
1911                         break;
1912                 case NVME_OPC_SET_FEATURES:
1913                         DPRINTF("%s command SET_FEATURES", __func__);
1914                         nvme_opc_set_features(sc, cmd, &compl);
1915                         break;
1916                 case NVME_OPC_GET_FEATURES:
1917                         DPRINTF("%s command GET_FEATURES", __func__);
1918                         nvme_opc_get_features(sc, cmd, &compl);
1919                         break;
1920                 case NVME_OPC_FIRMWARE_ACTIVATE:
1921                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1922                         pci_nvme_status_tc(&compl.status,
1923                             NVME_SCT_COMMAND_SPECIFIC,
1924                             NVME_SC_INVALID_FIRMWARE_SLOT);
1925                         break;
1926                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1927                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1928                         nvme_opc_async_event_req(sc, cmd, &compl);
1929                         break;
1930                 case NVME_OPC_FORMAT_NVM:
1931                         DPRINTF("%s command FORMAT_NVM", __func__);
1932                         if ((sc->ctrldata.oacs &
1933                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1934                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1935                                 break;
1936                         }
1937                         nvme_opc_format_nvm(sc, cmd, &compl);
1938                         break;
1939                 default:
1940                         DPRINTF("0x%x command is not implemented",
1941                             cmd->opc);
1942                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1943                 }
1944                 sqhead = (sqhead + 1) % sq->size;
1945
1946                 if (NVME_COMPLETION_VALID(compl)) {
1947                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1948                             compl.cdw0,
1949                             cmd->cid,
1950                             0,          /* SQID */
1951                             compl.status);
1952                 }
1953         }
1954
1955         DPRINTF("setting sqhead %u", sqhead);
1956         sq->head = sqhead;
1957
1958         if (cq->head != cq->tail)
1959                 pci_generate_msix(sc->nsc_pi, 0);
1960
1961         pthread_mutex_unlock(&sq->mtx);
1962 }
1963
1964 /*
1965  * Update the Write and Read statistics reported in SMART data
1966  *
1967  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up. E.g.
1968  * 1 data unit covers 1 - 1,000 512 byte blocks; 3 data units cover 2,001 - 3,000
1969  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1970  */
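/*
 * Illustrative example: with write_dunits_remainder starting at 999, the
 * first successful 512 byte write adds 1, rolls the remainder over to
 * 1,000 and is counted as one full data unit; subsequent writes then
 * accumulate in the remainder until another 1,000 blocks complete.
 */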
1971 static void
1972 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1973     size_t bytes, uint16_t status)
1974 {
1975
1976         pthread_mutex_lock(&sc->mtx);
1977         switch (opc) {
1978         case NVME_OPC_WRITE:
1979                 sc->write_commands++;
1980                 if (status != NVME_SC_SUCCESS)
1981                         break;
1982                 sc->write_dunits_remainder += (bytes / 512);
1983                 while (sc->write_dunits_remainder >= 1000) {
1984                         sc->write_data_units++;
1985                         sc->write_dunits_remainder -= 1000;
1986                 }
1987                 break;
1988         case NVME_OPC_READ:
1989                 sc->read_commands++;
1990                 if (status != NVME_SC_SUCCESS)
1991                         break;
1992                 sc->read_dunits_remainder += (bytes / 512);
1993                 while (sc->read_dunits_remainder >= 1000) {
1994                         sc->read_data_units++;
1995                         sc->read_dunits_remainder -= 1000;
1996                 }
1997                 break;
1998         default:
1999                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2000                 break;
2001         }
2002         pthread_mutex_unlock(&sc->mtx);
2003 }
2004
2005 /*
2006  * Check if the combination of Starting LBA (slba) and Number of Logical
2007  * Blocks (nlb) exceeds the range of the underlying storage.
2008  *
2009  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2010  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2011  * overflow.
2012  */
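/*
 * Illustrative example of the first check below: with 4096 byte sectors
 * (sectsz_bits == 12), "slba << sectsz_bits" would overflow a uint64_t
 * whenever any of bits 63:52 of slba are set, which is exactly what
 * "slba >> (64 - sectsz_bits)" being non-zero detects.
 */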
2013 static bool
2014 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2015     uint32_t nlb)
2016 {
2017         size_t  offset, bytes;
2018
2019         /* Overflow check of multiplying Starting LBA by the sector size */
2020         if (slba >> (64 - nvstore->sectsz_bits))
2021                 return (true);
2022
2023         offset = slba << nvstore->sectsz_bits;
2024         bytes = (size_t)nlb << nvstore->sectsz_bits;
2025
2026         /* Overflow check of Number of Logical Blocks */
2027         if ((nvstore->size - offset) < bytes)
2028                 return (true);
2029
2030         return (false);
2031 }
2032
2033 static int
2034 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2035         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2036 {
2037         int iovidx;
2038
2039         if (req == NULL)
2040                 return (-1);
2041
2042         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2043                 return (-1);
2044         }
2045
2046         /* concatenate contig block-iovs to minimize number of iovs */
2047         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
2048                 iovidx = req->io_req.br_iovcnt - 1;
2049
2050                 req->io_req.br_iov[iovidx].iov_base =
2051                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2052                                      req->prev_gpaddr, size);
2053
2054                 req->prev_size += size;
2055                 req->io_req.br_resid += size;
2056
2057                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2058         } else {
2059                 iovidx = req->io_req.br_iovcnt;
2060                 if (iovidx == 0) {
2061                         req->io_req.br_offset = lba;
2062                         req->io_req.br_resid = 0;
2063                         req->io_req.br_param = req;
2064                 }
2065
2066                 req->io_req.br_iov[iovidx].iov_base =
2067                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2068                                      gpaddr, size);
2069
2070                 req->io_req.br_iov[iovidx].iov_len = size;
2071
2072                 req->prev_gpaddr = gpaddr;
2073                 req->prev_size = size;
2074                 req->io_req.br_resid += size;
2075
2076                 req->io_req.br_iovcnt++;
2077         }
2078
2079         return (0);
2080 }
2081
2082 static void
2083 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2084         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2085         uint32_t cdw0, uint16_t status)
2086 {
2087         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2088
2089         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2090                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2091                  NVME_STATUS_GET_SC(status));
2092
2093         pci_nvme_cq_update(sc, cq,
2094             0,          /* CDW0 */
2095             cid,
2096             sqid,
2097             status);
2098
2099         if (cq->head != cq->tail) {
2100                 if (cq->intr_en & NVME_CQ_INTEN) {
2101                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2102                 } else {
2103                         DPRINTF("%s: CQ%u interrupt disabled",
2104                                                 __func__, sq->cqid);
2105                 }
2106         }
2107 }
2108
2109 static void
2110 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2111 {
2112         req->sc = NULL;
2113         req->nvme_sq = NULL;
2114         req->sqid = 0;
2115
2116         pthread_mutex_lock(&sc->mtx);
2117
2118         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2119         sc->pending_ios--;
2120
2121         /* Once no more IO is pending, set ready if the device was reset/enabled */
2122         if (sc->pending_ios == 0 &&
2123             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2124                 sc->regs.csts |= NVME_CSTS_RDY;
2125
2126         pthread_mutex_unlock(&sc->mtx);
2127
2128         sem_post(&sc->iosemlock);
2129 }
2130
2131 static struct pci_nvme_ioreq *
2132 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2133 {
2134         struct pci_nvme_ioreq *req = NULL;
2135
2136         sem_wait(&sc->iosemlock);
2137         pthread_mutex_lock(&sc->mtx);
2138
2139         req = STAILQ_FIRST(&sc->ioreqs_free);
2140         assert(req != NULL);
2141         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2142
2143         req->sc = sc;
2144
2145         sc->pending_ios++;
2146
2147         pthread_mutex_unlock(&sc->mtx);
2148
2149         req->io_req.br_iovcnt = 0;
2150         req->io_req.br_offset = 0;
2151         req->io_req.br_resid = 0;
2152         req->io_req.br_param = req;
2153         req->prev_gpaddr = 0;
2154         req->prev_size = 0;
2155
2156         return req;
2157 }
2158
2159 static void
2160 pci_nvme_io_done(struct blockif_req *br, int err)
2161 {
2162         struct pci_nvme_ioreq *req = br->br_param;
2163         struct nvme_submission_queue *sq = req->nvme_sq;
2164         uint16_t code, status;
2165
2166         DPRINTF("%s error %d %s", __func__, err, strerror(err));
2167
2168         /* TODO return correct error */
2169         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2170         pci_nvme_status_genc(&status, code);
2171
2172         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2173         pci_nvme_stats_write_read_update(req->sc, req->opc,
2174             req->bytes, status);
2175         pci_nvme_release_ioreq(req->sc, req);
2176 }
2177
2178 /*
2179  * Implements the Flush command. The specification states:
2180  *    If a volatile write cache is not present, Flush commands complete
2181  *    successfully and have no effect
2182  * in the description of the Volatile Write Cache (VWC) field of the Identify
2183  * Controller data. Therefore, set status to Success if the command is
2184  * not supported (i.e. RAM or as indicated by the blockif).
2185  */
2186 static bool
2187 nvme_opc_flush(struct pci_nvme_softc *sc,
2188     struct nvme_command *cmd,
2189     struct pci_nvme_blockstore *nvstore,
2190     struct pci_nvme_ioreq *req,
2191     uint16_t *status)
2192 {
2193         bool pending = false;
2194
2195         if (nvstore->type == NVME_STOR_RAM) {
2196                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2197         } else {
2198                 int err;
2199
2200                 req->io_req.br_callback = pci_nvme_io_done;
2201
2202                 err = blockif_flush(nvstore->ctx, &req->io_req);
2203                 switch (err) {
2204                 case 0:
2205                         pending = true;
2206                         break;
2207                 case EOPNOTSUPP:
2208                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2209                         break;
2210                 default:
2211                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2212                 }
2213         }
2214
2215         return (pending);
2216 }
2217
2218 static uint16_t
2219 nvme_write_read_ram(struct pci_nvme_softc *sc,
2220     struct pci_nvme_blockstore *nvstore,
2221     uint64_t prp1, uint64_t prp2,
2222     size_t offset, uint64_t bytes,
2223     bool is_write)
2224 {
2225         uint8_t *buf = nvstore->ctx;
2226         enum nvme_copy_dir dir;
2227         uint16_t status;
2228
2229         if (is_write)
2230                 dir = NVME_COPY_TO_PRP;
2231         else
2232                 dir = NVME_COPY_FROM_PRP;
2233
2234         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2235             buf + offset, bytes, dir))
2236                 pci_nvme_status_genc(&status,
2237                     NVME_SC_DATA_TRANSFER_ERROR);
2238         else
2239                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2240
2241         return (status);
2242 }
2243
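/*
 * Translate the command's PRP entries into the blockif iov list. PRP1
 * covers the first (possibly unaligned) page; for transfers that fit in
 * two pages PRP2 is simply the second page address, otherwise PRP2
 * points to a page-sized list of PRP entries whose last slot chains to
 * the next list page.
 */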
2244 static uint16_t
2245 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2246     struct pci_nvme_blockstore *nvstore,
2247     struct pci_nvme_ioreq *req,
2248     uint64_t prp1, uint64_t prp2,
2249     size_t offset, uint64_t bytes,
2250     bool is_write)
2251 {
2252         uint64_t size;
2253         int err;
2254         uint16_t status = NVME_NO_STATUS;
2255
2256         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2257         if (pci_nvme_append_iov_req(sc, req, prp1,
2258             size, is_write, offset)) {
2259                 pci_nvme_status_genc(&status,
2260                     NVME_SC_DATA_TRANSFER_ERROR);
2261                 goto out;
2262         }
2263
2264         offset += size;
2265         bytes  -= size;
2266
2267         if (bytes == 0) {
2268                 ;
2269         } else if (bytes <= PAGE_SIZE) {
2270                 size = bytes;
2271                 if (pci_nvme_append_iov_req(sc, req, prp2,
2272                     size, is_write, offset)) {
2273                         pci_nvme_status_genc(&status,
2274                             NVME_SC_DATA_TRANSFER_ERROR);
2275                         goto out;
2276                 }
2277         } else {
2278                 void *vmctx = sc->nsc_pi->pi_vmctx;
2279                 uint64_t *prp_list = &prp2;
2280                 uint64_t *last = prp_list;
2281
2282                 /* PRP2 is a pointer to a physical region page list */
2283                 while (bytes) {
2284                         /* Last entry in list points to the next list */
2285                         if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2286                                 uint64_t prp = *prp_list;
2287
2288                                 prp_list = paddr_guest2host(vmctx, prp,
2289                                     PAGE_SIZE - (prp % PAGE_SIZE));
2290                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
2291                         }
2292
2293                         size = MIN(bytes, PAGE_SIZE);
2294
2295                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
2296                             size, is_write, offset)) {
2297                                 pci_nvme_status_genc(&status,
2298                                     NVME_SC_DATA_TRANSFER_ERROR);
2299                                 goto out;
2300                         }
2301
2302                         offset += size;
2303                         bytes  -= size;
2304
2305                         prp_list++;
2306                 }
2307         }
2308         req->io_req.br_callback = pci_nvme_io_done;
2309         if (is_write)
2310                 err = blockif_write(nvstore->ctx, &req->io_req);
2311         else
2312                 err = blockif_read(nvstore->ctx, &req->io_req);
2313
2314         if (err)
2315                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2316 out:
2317         return (status);
2318 }
2319
2320 static bool
2321 nvme_opc_write_read(struct pci_nvme_softc *sc,
2322     struct nvme_command *cmd,
2323     struct pci_nvme_blockstore *nvstore,
2324     struct pci_nvme_ioreq *req,
2325     uint16_t *status)
2326 {
2327         uint64_t lba, nblocks, bytes;
2328         size_t offset;
2329         bool is_write = cmd->opc == NVME_OPC_WRITE;
2330         bool pending = false;
2331
2332         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2333         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2334
2335         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2336                 WPRINTF("%s command would exceed LBA range", __func__);
2337                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2338                 goto out;
2339         }
2340
2341         bytes  = nblocks << nvstore->sectsz_bits;
2342         if (bytes > NVME_MAX_DATA_SIZE) {
2343                 WPRINTF("%s command would exceed MDTS", __func__);
2344                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2345                 goto out;
2346         }
2347
2348         offset = lba << nvstore->sectsz_bits;
2349
2350         req->bytes = bytes;
2351         req->io_req.br_offset = lba;
2352
2353         /* PRP bits 1:0 must be zero */
2354         cmd->prp1 &= ~0x3UL;
2355         cmd->prp2 &= ~0x3UL;
2356
2357         if (nvstore->type == NVME_STOR_RAM) {
2358                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2359                     cmd->prp2, offset, bytes, is_write);
2360         } else {
2361                 *status = nvme_write_read_blockif(sc, nvstore, req,
2362                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2363
2364                 if (*status == NVME_NO_STATUS)
2365                         pending = true;
2366         }
2367 out:
2368         if (!pending)
2369                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2370
2371         return (pending);
2372 }
2373
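/*
 * blockif completion callback for multi-range Dataset Management
 * deallocations: prev_gpaddr tracks the index of the range that just
 * finished and prev_size holds the total number of ranges, so each
 * completion issues blockif_delete() for the next range until all of
 * them are done (see nvme_opc_dataset_mgmt below).
 */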
2374 static void
2375 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2376 {
2377         struct pci_nvme_ioreq *req = br->br_param;
2378         struct pci_nvme_softc *sc = req->sc;
2379         bool done = true;
2380         uint16_t status;
2381
2382         if (err) {
2383                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2384         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2385                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2386         } else {
2387                 struct iovec *iov = req->io_req.br_iov;
2388
2389                 req->prev_gpaddr++;
2390                 iov += req->prev_gpaddr;
2391
2392                 /* The iov_* values already include the sector size */
2393                 req->io_req.br_offset = (off_t)iov->iov_base;
2394                 req->io_req.br_resid = iov->iov_len;
2395                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2396                         pci_nvme_status_genc(&status,
2397                             NVME_SC_INTERNAL_DEVICE_ERROR);
2398                 } else
2399                         done = false;
2400         }
2401
2402         if (done) {
2403                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2404                     req->cid, 0, status);
2405                 pci_nvme_release_ioreq(sc, req);
2406         }
2407 }
2408
2409 static bool
2410 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2411     struct nvme_command *cmd,
2412     struct pci_nvme_blockstore *nvstore,
2413     struct pci_nvme_ioreq *req,
2414     uint16_t *status)
2415 {
2416         struct nvme_dsm_range *range;
2417         uint32_t nr, r, non_zero, dr;
2418         int err;
2419         bool pending = false;
2420
2421         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2422                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2423                 goto out;
2424         }
2425
2426         nr = cmd->cdw10 & 0xff;
2427
2428         /* copy locally because a range entry could straddle PRPs */
2429         range = calloc(1, NVME_MAX_DSM_TRIM);
2430         if (range == NULL) {
2431                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2432                 goto out;
2433         }
2434         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2435             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2436
2437         /* Check for invalid ranges and the number of non-zero lengths */
2438         non_zero = 0;
2439         for (r = 0; r <= nr; r++) {
2440                 if (pci_nvme_out_of_range(nvstore,
2441                     range[r].starting_lba, range[r].length)) {
2442                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2443                         goto out;
2444                 }
2445                 if (range[r].length != 0)
2446                         non_zero++;
2447         }
2448
2449         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2450                 size_t offset, bytes;
2451                 int sectsz_bits = sc->nvstore.sectsz_bits;
2452
2453                 /*
2454                  * DSM calls are advisory only, and compliant controllers
2455                  * may choose to take no action (i.e. return Success).
2456                  */
2457                 if (!nvstore->deallocate) {
2458                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2459                         goto out;
2460                 }
2461
2462                 /* If all ranges have a zero length, return Success */
2463                 if (non_zero == 0) {
2464                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2465                         goto out;
2466                 }
2467
2468                 if (req == NULL) {
2469                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2470                         goto out;
2471                 }
2472
2473                 offset = range[0].starting_lba << sectsz_bits;
2474                 bytes = range[0].length << sectsz_bits;
2475
2476                 /*
2477                  * If the request is for more than a single range, store
2478                  * the ranges in the br_iov. Optimize for the common case
2479                  * of a single range.
2480                  *
2481                  * Note that NVMe Number of Ranges is a zero based value
2482                  */
2483                 req->io_req.br_iovcnt = 0;
2484                 req->io_req.br_offset = offset;
2485                 req->io_req.br_resid = bytes;
2486
2487                 if (nr == 0) {
2488                         req->io_req.br_callback = pci_nvme_io_done;
2489                 } else {
2490                         struct iovec *iov = req->io_req.br_iov;
2491
2492                         for (r = 0, dr = 0; r <= nr; r++) {
2493                                 offset = range[r].starting_lba << sectsz_bits;
2494                                 bytes = range[r].length << sectsz_bits;
2495                                 if (bytes == 0)
2496                                         continue;
2497
2498                                 if ((nvstore->size - offset) < bytes) {
2499                                         pci_nvme_status_genc(status,
2500                                             NVME_SC_LBA_OUT_OF_RANGE);
2501                                         goto out;
2502                                 }
2503                                 iov[dr].iov_base = (void *)offset;
2504                                 iov[dr].iov_len = bytes;
2505                                 dr++;
2506                         }
2507                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2508
2509                         /*
2510                          * Use prev_gpaddr to track the current entry and
2511                          * prev_size to track the number of entries
2512                          */
2513                         req->prev_gpaddr = 0;
2514                         req->prev_size = dr;
2515                 }
2516
2517                 err = blockif_delete(nvstore->ctx, &req->io_req);
2518                 if (err)
2519                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2520                 else
2521                         pending = true;
2522         }
2523 out:
2524         free(range);
2525         return (pending);
2526 }
2527
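/*
 * Drain the I/O submission queue selected by idx: validate the NSID,
 * allocate an ioreq, and dispatch Flush, Read/Write, Write Zeroes and
 * Dataset Management. Commands that finish synchronously are completed
 * here; the rest complete from their blockif callbacks.
 */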
2528 static void
2529 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2530 {
2531         struct nvme_submission_queue *sq;
2532         uint16_t status;
2533         uint16_t sqhead;
2534
2535         /* handle all submissions up to sq->tail index */
2536         sq = &sc->submit_queues[idx];
2537
2538         pthread_mutex_lock(&sq->mtx);
2539
2540         sqhead = sq->head;
2541         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2542                  idx, sqhead, sq->tail, sq->qbase);
2543
2544         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2545                 struct nvme_command *cmd;
2546                 struct pci_nvme_ioreq *req;
2547                 uint32_t nsid;
2548                 bool pending;
2549
2550                 pending = false;
2551                 req = NULL;
2552                 status = 0;
2553
2554                 cmd = &sq->qbase[sqhead];
2555                 sqhead = (sqhead + 1) % sq->size;
2556
2557                 nsid = le32toh(cmd->nsid);
2558                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2559                         pci_nvme_status_genc(&status,
2560                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2561                         status |=
2562                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2563                         goto complete;
2564                 }
2565
2566                 req = pci_nvme_get_ioreq(sc);
2567                 if (req == NULL) {
2568                         pci_nvme_status_genc(&status,
2569                             NVME_SC_INTERNAL_DEVICE_ERROR);
2570                         WPRINTF("%s: unable to allocate IO req", __func__);
2571                         goto complete;
2572                 }
2573                 req->nvme_sq = sq;
2574                 req->sqid = idx;
2575                 req->opc = cmd->opc;
2576                 req->cid = cmd->cid;
2577                 req->nsid = cmd->nsid;
2578
2579                 switch (cmd->opc) {
2580                 case NVME_OPC_FLUSH:
2581                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2582                             req, &status);
2583                         break;
2584                 case NVME_OPC_WRITE:
2585                 case NVME_OPC_READ:
2586                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2587                             req, &status);
2588                         break;
2589                 case NVME_OPC_WRITE_ZEROES:
2590                         /* TODO: write zeroes
2591                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2592                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2593                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2594                         break;
2595                 case NVME_OPC_DATASET_MANAGEMENT:
2596                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2597                             req, &status);
2598                         break;
2599                 default:
2600                         WPRINTF("%s unhandled io command 0x%x",
2601                             __func__, cmd->opc);
2602                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2603                 }
2604 complete:
2605                 if (!pending) {
2606                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2607                             status);
2608                         if (req != NULL)
2609                                 pci_nvme_release_ioreq(sc, req);
2610                 }
2611         }
2612
2613         sq->head = sqhead;
2614
2615         pthread_mutex_unlock(&sq->mtx);
2616 }
2617
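/*
 * Doorbell handling: writes past NVME_DOORBELL_OFFSET in BAR0 fall into
 * 8 byte per-queue slots where the first 4 bytes are the submission
 * queue tail doorbell and the next 4 bytes are the completion queue
 * head doorbell; pci_nvme_write_bar_0() derives idx and is_sq from the
 * offset accordingly.
 */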
2618 static void
2619 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2620         uint64_t idx, int is_sq, uint64_t value)
2621 {
2622         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2623                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2624
2625         if (is_sq) {
2626                 if (idx > sc->num_squeues) {
2627                         WPRINTF("%s queue index %lu overflow from "
2628                                  "guest (max %u)",
2629                                  __func__, idx, sc->num_squeues);
2630                         return;
2631                 }
2632
2633                 atomic_store_short(&sc->submit_queues[idx].tail,
2634                                    (uint16_t)value);
2635
2636                 if (idx == 0) {
2637                         pci_nvme_handle_admin_cmd(sc, value);
2638                 } else {
2639                         /* submission queue; handle new entries in SQ */
2640                         if (idx > sc->num_squeues) {
2641                                 WPRINTF("%s SQ index %lu overflow from "
2642                                          "guest (max %u)",
2643                                          __func__, idx, sc->num_squeues);
2644                                 return;
2645                         }
2646                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2647                 }
2648         } else {
2649                 if (idx > sc->num_cqueues) {
2650                         WPRINTF("%s queue index %lu overflow from "
2651                                  "guest (max %u)",
2652                                  __func__, idx, sc->num_cqueues);
2653                         return;
2654                 }
2655
2656                 atomic_store_short(&sc->compl_queues[idx].head,
2657                                 (uint16_t)value);
2658         }
2659 }
2660
2661 static void
2662 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2663 {
2664         const char *s = iswrite ? "WRITE" : "READ";
2665
2666         switch (offset) {
2667         case NVME_CR_CAP_LOW:
2668                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2669                 break;
2670         case NVME_CR_CAP_HI:
2671                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2672                 break;
2673         case NVME_CR_VS:
2674                 DPRINTF("%s %s NVME_CR_VS", func, s);
2675                 break;
2676         case NVME_CR_INTMS:
2677                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2678                 break;
2679         case NVME_CR_INTMC:
2680                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2681                 break;
2682         case NVME_CR_CC:
2683                 DPRINTF("%s %s NVME_CR_CC", func, s);
2684                 break;
2685         case NVME_CR_CSTS:
2686                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2687                 break;
2688         case NVME_CR_NSSR:
2689                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2690                 break;
2691         case NVME_CR_AQA:
2692                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2693                 break;
2694         case NVME_CR_ASQ_LOW:
2695                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2696                 break;
2697         case NVME_CR_ASQ_HI:
2698                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2699                 break;
2700         case NVME_CR_ACQ_LOW:
2701                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2702                 break;
2703         case NVME_CR_ACQ_HI:
2704                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2705                 break;
2706         default:
2707                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2708         }
2709
2710 }
2711
2712 static void
2713 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2714         uint64_t offset, int size, uint64_t value)
2715 {
2716         uint32_t ccreg;
2717
2718         if (offset >= NVME_DOORBELL_OFFSET) {
2719                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2720                 uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2721                 int is_sq = (belloffset % 8) < 4;
2722
2723                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2724                         WPRINTF("guest attempted an overflow write offset "
2725                                  "0x%lx, val 0x%lx in %s",
2726                                  offset, value, __func__);
2727                         return;
2728                 }
2729
2730                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2731                 return;
2732         }
2733
2734         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2735                 offset, size, value);
2736
2737         if (size != 4) {
2738                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2739                          "val 0x%lx) to bar0 in %s",
2740                          size, offset, value, __func__);
2741                 /* TODO: shutdown device */
2742                 return;
2743         }
2744
2745         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2746
2747         pthread_mutex_lock(&sc->mtx);
2748
2749         switch (offset) {
2750         case NVME_CR_CAP_LOW:
2751         case NVME_CR_CAP_HI:
2752                 /* readonly */
2753                 break;
2754         case NVME_CR_VS:
2755                 /* readonly */
2756                 break;
2757         case NVME_CR_INTMS:
2758                 /* MSI-X, so ignore */
2759                 break;
2760         case NVME_CR_INTMC:
2761                 /* MSI-X, so ignore */
2762                 break;
2763         case NVME_CR_CC:
2764                 ccreg = (uint32_t)value;
2765
2766                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2767                          "iocqes %u",
2768                         __func__,
2769                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2770                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2771                          NVME_CC_GET_IOCQES(ccreg));
2772
2773                 if (NVME_CC_GET_SHN(ccreg)) {
2774                         /* perform shutdown - flush out data to backend */
2775                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2776                             NVME_CSTS_REG_SHST_SHIFT);
2777                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2778                             NVME_CSTS_REG_SHST_SHIFT;
2779                 }
2780                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2781                         if (NVME_CC_GET_EN(ccreg) == 0)
2782                                 /* transition 1->0 causes controller reset */
2783                                 pci_nvme_reset_locked(sc);
2784                         else
2785                                 pci_nvme_init_controller(ctx, sc);
2786                 }
2787
2788                 /* Insert the iocqes, iosqes and en bits from the write */
2789                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2790                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2791                 if (NVME_CC_GET_EN(ccreg) == 0) {
2792                         /* Insert the ams, mps and css bit fields */
2793                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2794                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2795                         sc->regs.csts &= ~NVME_CSTS_RDY;
2796                 } else if (sc->pending_ios == 0) {
2797                         sc->regs.csts |= NVME_CSTS_RDY;
2798                 }
2799                 break;
2800         case NVME_CR_CSTS:
2801                 break;
2802         case NVME_CR_NSSR:
2803                 /* ignore writes; don't support subsystem reset */
2804                 break;
2805         case NVME_CR_AQA:
2806                 sc->regs.aqa = (uint32_t)value;
2807                 break;
2808         case NVME_CR_ASQ_LOW:
2809                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2810                                (0xFFFFF000 & value);
2811                 break;
2812         case NVME_CR_ASQ_HI:
2813                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2814                                (value << 32);
2815                 break;
2816         case NVME_CR_ACQ_LOW:
2817                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2818                                (0xFFFFF000 & value);
2819                 break;
2820         case NVME_CR_ACQ_HI:
2821                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2822                                (value << 32);
2823                 break;
2824         default:
2825                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2826                          __func__, offset, value, size);
2827         }
2828         pthread_mutex_unlock(&sc->mtx);
2829 }
2830
2831 static void
2832 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2833                 int baridx, uint64_t offset, int size, uint64_t value)
2834 {
2835         struct pci_nvme_softc* sc = pi->pi_arg;
2836
2837         if (baridx == pci_msix_table_bar(pi) ||
2838             baridx == pci_msix_pba_bar(pi)) {
2839                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2840                          " value 0x%lx", baridx, offset, size, value);
2841
2842                 pci_emul_msix_twrite(pi, offset, size, value);
2843                 return;
2844         }
2845
2846         switch (baridx) {
2847         case 0:
2848                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2849                 break;
2850
2851         default:
2852                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2853                          __func__, baridx, value);
2854         }
2855 }
2856
2857 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2858         uint64_t offset, int size)
2859 {
2860         uint64_t value;
2861
2862         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2863
2864         if (offset < NVME_DOORBELL_OFFSET) {
2865                 void *p = &(sc->regs);
2866                 pthread_mutex_lock(&sc->mtx);
2867                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2868                 pthread_mutex_unlock(&sc->mtx);
2869         } else {
2870                 value = 0;
2871                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2872         }
2873
2874         switch (size) {
2875         case 1:
2876                 value &= 0xFF;
2877                 break;
2878         case 2:
2879                 value &= 0xFFFF;
2880                 break;
2881         case 4:
2882                 value &= 0xFFFFFFFF;
2883                 break;
2884         }
2885
2886         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2887                  offset, size, (uint32_t)value);
2888
2889         return (value);
2890 }
2891
2892
2893
2894 static uint64_t
2895 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2896     uint64_t offset, int size)
2897 {
2898         struct pci_nvme_softc* sc = pi->pi_arg;
2899
2900         if (baridx == pci_msix_table_bar(pi) ||
2901             baridx == pci_msix_pba_bar(pi)) {
2902                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2903                         baridx, offset, size);
2904
2905                 return pci_emul_msix_tread(pi, offset, size);
2906         }
2907
2908         switch (baridx) {
2909         case 0:
2910                 return pci_nvme_read_bar_0(sc, offset, size);
2911
2912         default:
2913                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2914         }
2915
2916         return (0);
2917 }
2918
2919 static int
2920 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2921 {
2922         char bident[sizeof("XX:X:X")];
2923         const char *value;
2924         uint32_t sectsz;
2925
2926         sc->max_queues = NVME_QUEUES;
2927         sc->max_qentries = NVME_MAX_QENTRIES;
2928         sc->ioslots = NVME_IOSLOTS;
2929         sc->num_squeues = sc->max_queues;
2930         sc->num_cqueues = sc->max_queues;
2931         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2932         sectsz = 0;
2933         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2934                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2935
2936         value = get_config_value_node(nvl, "maxq");
2937         if (value != NULL)
2938                 sc->max_queues = atoi(value);
2939         value = get_config_value_node(nvl, "qsz");
2940         if (value != NULL) {
2941                 sc->max_qentries = atoi(value);
2942                 if (sc->max_qentries <= 0) {
2943                         EPRINTLN("nvme: Invalid qsz option %d",
2944                             sc->max_qentries);
2945                         return (-1);
2946                 }
2947         }
2948         value = get_config_value_node(nvl, "ioslots");
2949         if (value != NULL) {
2950                 sc->ioslots = atoi(value);
2951                 if (sc->ioslots <= 0) {
2952                         EPRINTLN("nvme: Invalid ioslots option %d", sc->ioslots);
2953                         return (-1);
2954                 }
2955         }
2956         value = get_config_value_node(nvl, "sectsz");
2957         if (value != NULL)
2958                 sectsz = atoi(value);
2959         value = get_config_value_node(nvl, "ser");
2960         if (value != NULL) {
2961                 /*
2962                  * This field indicates the Product Serial Number in
2963                  * 7-bit ASCII; unused bytes should be space characters.
2964                  * Ref: NVMe v1.3c.
2965                  */
2966                 cpywithpad((char *)sc->ctrldata.sn,
2967                     sizeof(sc->ctrldata.sn), value, ' ');
2968         }
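             /*
              * Illustrative example: ser=NVM0001 becomes
              *   sn = "NVM0001             "
              * i.e. the value space-padded to the fixed 20-byte ASCII field.
              */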
2969         value = get_config_value_node(nvl, "eui64");
2970         if (value != NULL)
2971                 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2972         value = get_config_value_node(nvl, "dsm");
2973         if (value != NULL) {
2974                 if (strcmp(value, "auto") == 0)
2975                         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2976                 else if (strcmp(value, "enable") == 0)
2977                         sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2978                 else if (strcmp(value, "disable") == 0)
2979                         sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2980         }
2981
2982         value = get_config_value_node(nvl, "ram");
2983         if (value != NULL) {
2984                 uint64_t sz = strtoull(value, NULL, 10);
2985
2986                 sc->nvstore.type = NVME_STOR_RAM;
2987                 sc->nvstore.size = sz * 1024 * 1024;
2988                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2989                 sc->nvstore.sectsz = 4096;
2990                 sc->nvstore.sectsz_bits = 12;
2991                 if (sc->nvstore.ctx == NULL) {
2992                         EPRINTLN("nvme: Unable to allocate RAM");
2993                         return (-1);
2994                 }
2995         } else {
2996                 snprintf(bident, sizeof(bident), "%d:%d",
2997                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2998                 sc->nvstore.ctx = blockif_open(nvl, bident);
2999                 if (sc->nvstore.ctx == NULL) {
3000                         EPRINTLN("nvme: Could not open backing file: %s",
3001                             strerror(errno));
3002                         return (-1);
3003                 }
3004                 sc->nvstore.type = NVME_STOR_BLOCKIF;
3005                 sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3006         }
3007
3008         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3009                 sc->nvstore.sectsz = sectsz;
3010         else if (sc->nvstore.type != NVME_STOR_RAM)
3011                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
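             /*
              * The loop below derives sectsz_bits as log2(sectsz): e.g.
              * 512 -> 9, 4096 -> 12, 8192 -> 13 (rounding up for a
              * non-power-of-two size).
              */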
3012         for (sc->nvstore.sectsz_bits = 9;
3013              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3014              sc->nvstore.sectsz_bits++);
3015
3016         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3017                 sc->max_queues = NVME_QUEUES;
3018
3019         return (0);
3020 }
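
/*
 * Illustrative example (hypothetical values): a device configured with
 * maxq=8, qsz=1024 and ser=NVM0001 reaches this parser as nvlist nodes, so
 *   get_config_value_node(nvl, "qsz") -> "1024"    => max_qentries = 1024
 *   get_config_value_node(nvl, "ser") -> "NVM0001" => space-padded into sn
 * Options that are not set keep the defaults established at the top of
 * pci_nvme_parse_config().
 */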
3021
3022 static void
3023 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3024 {
3025         struct pci_nvme_softc *sc;
3026         struct pci_nvme_blockstore *nvstore;
3027         struct nvme_namespace_data *nd;
3028
3029         sc = arg;
3030         nvstore = &sc->nvstore;
3031         nd = &sc->nsdata;
3032
3033         nvstore->size = new_size;
3034         pci_nvme_init_nsdata_size(nvstore, nd);
3035
3036         /* Add changed NSID (1) to list; remaining entries stay zero */
3037         sc->ns_log.ns[0] = 1;
3038         sc->ns_log.ns[1] = 0;
3039
3040         pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3041             PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
3042 }
3043
3044 static int
3045 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3046 {
3047         struct pci_nvme_softc *sc;
3048         uint32_t pci_membar_sz;
3049         int     error;
3050
3051         error = 0;
3052
3053         sc = calloc(1, sizeof(struct pci_nvme_softc));
3054         pi->pi_arg = sc;
3055         sc->nsc_pi = pi;
3056
3057         error = pci_nvme_parse_config(sc, nvl);
3058         if (error < 0)
3059                 goto done;
3060         else
3061                 error = 0;
3062
3063         STAILQ_INIT(&sc->ioreqs_free);
3064         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3065         for (int i = 0; i < sc->ioslots; i++) {
3066                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3067         }
3068
3069         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3070         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3071         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3072         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3073         pci_set_cfgdata8(pi, PCIR_PROGIF,
3074                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3075
3076         /*
3077          * Allocate size of NVMe registers + doorbell space for all queues.
3078          *
3079          * The specification requires a minimum memory I/O window size of 16K.
3080          * The Windows driver will refuse to start a device with a smaller
3081          * window.
3082          */
3083         pci_membar_sz = sizeof(struct nvme_registers) +
3084             2 * sizeof(uint32_t) * (sc->max_queues + 1);
3085         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
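             /*
              * Worked example (illustrative): with max_queues = 16 the
              * doorbell area adds 2 * sizeof(uint32_t) * 17 = 136 bytes,
              * which together with the register file is still below
              * NVME_MMIO_SPACE_MIN, so MAX() sizes the BAR to the 16K
              * minimum.
              */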
3086
3087         DPRINTF("nvme membar size: %u", pci_membar_sz);
3088
3089         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3090         if (error) {
3091                 WPRINTF("%s pci alloc mem bar failed", __func__);
3092                 goto done;
3093         }
3094
3095         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3096         if (error) {
3097                 WPRINTF("%s pci add msixcap failed", __func__);
3098                 goto done;
3099         }
3100
3101         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3102         if (error) {
3103                 WPRINTF("%s pci add Express capability failed", __func__);
3104                 goto done;
3105         }
3106
3107         pthread_mutex_init(&sc->mtx, NULL);
3108         sem_init(&sc->iosemlock, 0, sc->ioslots);
3109         if (sc->nvstore.type == NVME_STOR_BLOCKIF)
                     blockif_register_resize_callback(sc->nvstore.ctx,
                         pci_nvme_resized, sc);
3110
3111         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3112         /*
3113          * Controller data depends on Namespace data, so initialize
3114          * Namespace data first.
3115          */
3116         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3117         pci_nvme_init_ctrldata(sc);
3118         pci_nvme_init_logpages(sc);
3119         pci_nvme_init_features(sc);
3120
3121         pci_nvme_aer_init(sc);
3122         pci_nvme_aen_init(sc);
3123
3124         pci_nvme_reset(sc);
3125
3126         pci_lintr_request(pi);
3127
3128 done:
3129         return (error);
3130 }
3131
3132 static int
3133 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3134 {
3135         char *cp, *ram;
3136
3137         if (opts == NULL)
3138                 return (0);
3139
3140         if (strncmp(opts, "ram=", 4) == 0) {
3141                 cp = strchr(opts, ',');
3142                 if (cp == NULL) {
3143                         set_config_value_node(nvl, "ram", opts + 4);
3144                         return (0);
3145                 }
3146                 ram = strndup(opts + 4, cp - opts - 4);
3147                 set_config_value_node(nvl, "ram", ram);
3148                 free(ram);
3149                 return (pci_parse_legacy_config(nvl, cp + 1));
3150         } else
3151                 return (blockif_legacy_config(nvl, opts));
3152 }
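
/*
 * Illustrative example of the legacy option handling above: an opts string
 * of "ram=4096,ser=NVM0001" stores "4096" (MiB) in the "ram" node and hands
 * "ser=NVM0001" to pci_parse_legacy_config(), while any other opts string
 * (e.g. "/dev/zvol/pool/disk0,sectsz=4096") is passed to
 * blockif_legacy_config() so the leading path becomes the backing store.
 */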
3153
3154 struct pci_devemu pci_de_nvme = {
3155         .pe_emu =       "nvme",
3156         .pe_init =      pci_nvme_init,
3157         .pe_legacy_config = pci_nvme_legacy_config,
3158         .pe_barwrite =  pci_nvme_write,
3159         .pe_barread =   pci_nvme_read
3160 };
3161 PCI_EMUL_SET(pci_de_nvme);