1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
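/*
 * Illustrative invocation (editorial sketch, not part of the original source;
 * the slot number, image path, and serial below are placeholders):
 *
 *   bhyve ... -s 4,nvme,/path/to/disk.img,maxq=4,qsz=512,ioslots=16,ser=BHYVE001 ...
 *
 * Options not given fall back to the built-in defaults described above.
 */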
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN             0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
106
107 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
108 #define NVME_MDTS               9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
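/*
 * Worked example (editorial note, not in the original source): with
 * NVME_MDTS = 9 and NVME_MPSMIN = 0, NVME_MPSMIN_BYTES is 1 << 12 = 4096,
 * so NVME_MAX_DATA_SIZE is (1 << 9) * 4096 = 2 MiB per I/O command, and
 * NVME_MAX_IOVEC allows 512 + 1 = 513 page descriptors for such a transfer.
 */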
112
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS          0xffff
115 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
116
117 /* helpers */
118
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)         ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)         ((one)  - 1)
123
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126         (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
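/*
 * Worked example (editorial note, not in the original source): for
 * sc->num_squeues = 4 and sc->num_cqueues = 2, the encoding above yields
 * (3 & 0xffff) | ((1 & 0xffff) << 16) = 0x00010003, i.e. the zero-based
 * queue counts packed into the low and high 16 bits respectively.
 */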
128
129 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
130
131 enum nvme_controller_register_offsets {
132         NVME_CR_CAP_LOW = 0x00,
133         NVME_CR_CAP_HI  = 0x04,
134         NVME_CR_VS      = 0x08,
135         NVME_CR_INTMS   = 0x0c,
136         NVME_CR_INTMC   = 0x10,
137         NVME_CR_CC      = 0x14,
138         NVME_CR_CSTS    = 0x1c,
139         NVME_CR_NSSR    = 0x20,
140         NVME_CR_AQA     = 0x24,
141         NVME_CR_ASQ_LOW = 0x28,
142         NVME_CR_ASQ_HI  = 0x2c,
143         NVME_CR_ACQ_LOW = 0x30,
144         NVME_CR_ACQ_HI  = 0x34,
145 };
146
147 enum nvme_cmd_cdw11 {
148         NVME_CMD_CDW11_PC  = 0x0001,
149         NVME_CMD_CDW11_IEN = 0x0002,
150         NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152
153 enum nvme_copy_dir {
154         NVME_COPY_TO_PRP,
155         NVME_COPY_FROM_PRP,
156 };
157
158 #define NVME_CQ_INTEN   0x01
159 #define NVME_CQ_INTCOAL 0x02
160
161 struct nvme_completion_queue {
162         struct nvme_completion *qbase;
163         pthread_mutex_t mtx;
164         uint32_t        size;
165         uint16_t        tail; /* nvme progress */
166         uint16_t        head; /* guest progress */
167         uint16_t        intr_vec;
168         uint32_t        intr_en;
169 };
170
171 struct nvme_submission_queue {
172         struct nvme_command *qbase;
173         pthread_mutex_t mtx;
174         uint32_t        size;
175         uint16_t        head; /* nvme progress */
176         uint16_t        tail; /* guest progress */
177         uint16_t        cqid; /* completion queue id */
178         int             qpriority;
179 };
180
181 enum nvme_storage_type {
182         NVME_STOR_BLOCKIF = 0,
183         NVME_STOR_RAM = 1,
184 };
185
186 struct pci_nvme_blockstore {
187         enum nvme_storage_type type;
188         void            *ctx;
189         uint64_t        size;
190         uint32_t        sectsz;
191         uint32_t        sectsz_bits;
192         uint64_t        eui64;
193         uint32_t        deallocate:1;
194 };
195
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
203 #define MDTS_PAD_SIZE \
204         NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205         NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206         0
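/*
 * Worked example (editorial note, not in the original source): assuming
 * BLOCKIF_IOV_MAX is 128 (its value in block_if.h at the time of writing),
 * MDTS_PAD_SIZE evaluates to 513 - 128 = 385 extra iovec entries, which is
 * the length of iovpadding[] in struct pci_nvme_ioreq below.
 */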
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 struct pci_nvme_aer {
255         STAILQ_ENTRY(pci_nvme_aer) link;
256         uint16_t        cid;    /* Command ID of the submitted AER */
257 };
258
259 struct pci_nvme_softc {
260         struct pci_devinst *nsc_pi;
261
262         pthread_mutex_t mtx;
263
264         struct nvme_registers regs;
265
266         struct nvme_namespace_data  nsdata;
267         struct nvme_controller_data ctrldata;
268         struct nvme_error_information_entry err_log;
269         struct nvme_health_information_page health_log;
270         struct nvme_firmware_page fw_log;
271
272         struct pci_nvme_blockstore nvstore;
273
274         uint16_t        max_qentries;   /* max entries per queue */
275         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
276         uint32_t        num_cqueues;
277         uint32_t        num_squeues;
278         bool            num_q_is_set; /* Has host set Number of Queues */
279
280         struct pci_nvme_ioreq *ioreqs;
281         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
282         uint32_t        pending_ios;
283         uint32_t        ioslots;
284         sem_t           iosemlock;
285
286         /*
287          * Memory mapped Submission and Completion queues
288          * Each array includes both Admin and IO queues
289          */
290         struct nvme_completion_queue *compl_queues;
291         struct nvme_submission_queue *submit_queues;
292
293         struct nvme_feature_obj feat[NVME_FID_MAX];
294
295         enum nvme_dsm_type dataset_management;
296
297         /* Accounting for SMART data */
298         __uint128_t     read_data_units;
299         __uint128_t     write_data_units;
300         __uint128_t     read_commands;
301         __uint128_t     write_commands;
302         uint32_t        read_dunits_remainder;
303         uint32_t        write_dunits_remainder;
304
305         STAILQ_HEAD(, pci_nvme_aer) aer_list;
306         uint32_t        aer_count;
307 };
308
309
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
313
314 /* Controller Configuration utils */
315 #define NVME_CC_GET_EN(cc) \
316         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
317 #define NVME_CC_GET_CSS(cc) \
318         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
319 #define NVME_CC_GET_SHN(cc) \
320         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
321 #define NVME_CC_GET_IOSQES(cc) \
322         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
323 #define NVME_CC_GET_IOCQES(cc) \
324         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
325
326 #define NVME_CC_WRITE_MASK \
327         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
328          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
329          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
330
331 #define NVME_CC_NEN_WRITE_MASK \
332         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
333          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
334          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
335
336 /* Controller Status utils */
337 #define NVME_CSTS_GET_RDY(sts) \
338         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
339
340 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
341
342 /* Completion Queue status word utils */
343 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
344 #define NVME_STATUS_MASK \
345         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
346          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
347
348 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
349         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
350
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352     struct nvme_feature_obj *,
353     struct nvme_command *,
354     struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359 static void nvme_feature_iv_config(struct pci_nvme_softc *,
360     struct nvme_feature_obj *,
361     struct nvme_command *,
362     struct nvme_completion *);
363
364 static __inline void
365 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
366 {
367         size_t len;
368
369         len = strnlen(src, dst_size);
370         memset(dst, pad, dst_size);
371         memcpy(dst, src, len);
372 }
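/*
 * Usage sketch (editorial note, not in the original source): Identify string
 * fields are space padded rather than NUL terminated, e.g.
 *
 *   cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
 *
 * copies the model name and fills the remaining bytes with ' '.
 */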
373
374 static __inline void
375 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
376 {
377
378         *status &= ~NVME_STATUS_MASK;
379         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
380                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
381 }
382
383 static __inline void
384 pci_nvme_status_genc(uint16_t *status, uint16_t code)
385 {
386
387         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
388 }
389
390 /*
391  * Initialize the requested number of IO Submission and Completion Queues.
392  * Admin queues are allocated implicitly.
393  */
394 static void
395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
396 {
397         uint32_t i;
398
399         /*
400          * Allocate and initialize the Submission Queues
401          */
402         if (nsq > NVME_QUEUES) {
403                 WPRINTF("%s: clamping number of SQ from %u to %u",
404                                         __func__, nsq, NVME_QUEUES);
405                 nsq = NVME_QUEUES;
406         }
407
408         sc->num_squeues = nsq;
409
410         sc->submit_queues = calloc(sc->num_squeues + 1,
411                                 sizeof(struct nvme_submission_queue));
412         if (sc->submit_queues == NULL) {
413                 WPRINTF("%s: SQ allocation failed", __func__);
414                 sc->num_squeues = 0;
415         } else {
416                 struct nvme_submission_queue *sq = sc->submit_queues;
417
418                 for (i = 0; i < sc->num_squeues; i++)
419                         pthread_mutex_init(&sq[i].mtx, NULL);
420         }
421
422         /*
423          * Allocate and initialize the Completion Queues
424          */
425         if (ncq > NVME_QUEUES) {
426                 WPRINTF("%s: clamping number of CQ from %u to %u",
427                                         __func__, ncq, NVME_QUEUES);
428                 ncq = NVME_QUEUES;
429         }
430
431         sc->num_cqueues = ncq;
432
433         sc->compl_queues = calloc(sc->num_cqueues + 1,
434                                 sizeof(struct nvme_completion_queue));
435         if (sc->compl_queues == NULL) {
436                 WPRINTF("%s: CQ allocation failed", __func__);
437                 sc->num_cqueues = 0;
438         } else {
439                 struct nvme_completion_queue *cq = sc->compl_queues;
440
441                 for (i = 0; i < sc->num_cqueues; i++)
442                         pthread_mutex_init(&cq[i].mtx, NULL);
443         }
444 }
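/*
 * Editorial example (not in the original source): pci_nvme_init_queues(sc,
 * 16, 16) allocates 17 submission and 17 completion queue structures; index
 * 0 holds the implicit Admin queues and indices 1..16 back the I/O queues.
 */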
445
446 static void
447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
448 {
449         struct nvme_controller_data *cd = &sc->ctrldata;
450
451         cd->vid = 0xFB5D;
452         cd->ssvid = 0x0000;
453
454         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
455         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
456
457         /* Recommended Arbitration Burst (2^rab submission commands fetched at a time) */
458         cd->rab   = 4;
459
460         /* FreeBSD OUI */
461         cd->ieee[0] = 0x58;
462         cd->ieee[1] = 0x9c;
463         cd->ieee[2] = 0xfc;
464
465         cd->mic = 0;
466
467         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
468
469         cd->ver = 0x00010300;
470
471         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
472         cd->acl = 2;
473         cd->aerl = 4;
474
475         /* Advertise one read-only firmware slot */
476         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
477             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
478         cd->lpa = 0;    /* TODO: support some simple things like SMART */
479         cd->elpe = 0;   /* max error log page entries */
480         cd->npss = 1;   /* number of power states support */
481
482         /* Warning Composite Temperature Threshold */
483         cd->wctemp = 0x0157;
484
485         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
486             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
487         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
488             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
489         cd->nn = 1;     /* number of namespaces */
490
491         cd->oncs = 0;
492         switch (sc->dataset_management) {
493         case NVME_DATASET_MANAGEMENT_AUTO:
494                 if (sc->nvstore.deallocate)
495                         cd->oncs |= NVME_ONCS_DSM;
496                 break;
497         case NVME_DATASET_MANAGEMENT_ENABLE:
498                 cd->oncs |= NVME_ONCS_DSM;
499                 break;
500         default:
501                 break;
502         }
503
504         cd->fna = 0x03;
505
506         cd->power_state[0].mp = 10;
507 }
508
509 /*
510  * Calculate the CRC-16 of the given buffer
511  * See copyright attribution at top of file
512  */
513 static uint16_t
514 crc16(uint16_t crc, const void *buffer, unsigned int len)
515 {
516         const unsigned char *cp = buffer;
517         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
518         static uint16_t const crc16_table[256] = {
519                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
520                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
521                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
522                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
523                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
524                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
525                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
526                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
527                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
528                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
529                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
530                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
531                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
532                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
533                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
534                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
535                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
536                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
537                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
538                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
539                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
540                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
541                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
542                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
543                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
544                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
545                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
546                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
547                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
548                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
549                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
550                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
551         };
552
553         while (len--)
554                 crc = (((crc >> 8) & 0xffU) ^
555                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
556         return crc;
557 }
558
559 static void
560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
561     struct nvme_namespace_data *nd, uint32_t nsid,
562     struct pci_nvme_blockstore *nvstore)
563 {
564
565         /* Get capacity and block size information from backing store */
566         nd->nsze = nvstore->size / nvstore->sectsz;
567         nd->ncap = nd->nsze;
568         nd->nuse = nd->nsze;
569
570         if (nvstore->type == NVME_STOR_BLOCKIF)
571                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
572
573         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
574         nd->flbas = 0;
575
576         /* Create an EUI-64 if user did not provide one */
577         if (nvstore->eui64 == 0) {
578                 char *data = NULL;
579                 uint64_t eui64 = nvstore->eui64;
580
581                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
582                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
583
584                 if (data != NULL) {
585                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
586                         free(data);
587                 }
588                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
589         }
590         be64enc(nd->eui64, nvstore->eui64);
591
592         /* LBA data-sz = 2^lbads */
593         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
594 }
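/*
 * Editorial example (not in the original source): with a 512-byte sector
 * backing store, sectsz_bits is 9, so lbaf[0] advertises LBADS = 9 and the
 * guest derives an LBA data size of 2^9 = 512 bytes.
 */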
595
596 static void
597 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
598 {
599
600         memset(&sc->err_log, 0, sizeof(sc->err_log));
601         memset(&sc->health_log, 0, sizeof(sc->health_log));
602         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
603
604         /* Set read/write remainder to round up according to spec */
605         sc->read_dunits_remainder = 999;
606         sc->write_dunits_remainder = 999;
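        /*
         * Editorial note (not in the original source): a SMART "data unit"
         * is 1000 units of 512 bytes, so with the remainder primed to 999
         * the very first 512-byte transfer is intended to roll the
         * corresponding counter over to 1, i.e. the value rounds up.
         */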
607
608         /* Set nominal Health values checked by implementations */
609         sc->health_log.temperature = 310;
610         sc->health_log.available_spare = 100;
611         sc->health_log.available_spare_threshold = 10;
612 }
613
614 static void
615 pci_nvme_init_features(struct pci_nvme_softc *sc)
616 {
617
618         sc->feat[0].set = nvme_feature_invalid_cb;
619         sc->feat[0].get = nvme_feature_invalid_cb;
620
621         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
622         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
623         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
624         sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
625             nvme_feature_iv_config;
626         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
627             nvme_feature_invalid_cb;
628         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
629             nvme_feature_invalid_cb;
630 }
631
632 static void
633 pci_nvme_aer_init(struct pci_nvme_softc *sc)
634 {
635
636         STAILQ_INIT(&sc->aer_list);
637         sc->aer_count = 0;
638 }
639
640 static void
641 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
642 {
643         struct pci_nvme_aer *aer = NULL;
644
645         while (!STAILQ_EMPTY(&sc->aer_list)) {
646                 aer = STAILQ_FIRST(&sc->aer_list);
647                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
648                 free(aer);
649         }
650
651         pci_nvme_aer_init(sc);
652 }
653
654 static bool
655 pci_nvme_aer_available(struct pci_nvme_softc *sc)
656 {
657
658         return (!STAILQ_EMPTY(&sc->aer_list));
659 }
660
661 static bool
662 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
663 {
664         struct nvme_controller_data *cd = &sc->ctrldata;
665
666         /* AERL is a zero-based value while aer_count is one-based */
667         return (sc->aer_count == (cd->aerl + 1));
668 }
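/*
 * Editorial example (not in the original source): ctrldata.aerl is set to 4
 * in pci_nvme_init_ctrldata(), so up to 5 Asynchronous Event Requests may be
 * outstanding before this check causes further AER commands to be rejected.
 */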
669
670 /*
671  * Add an Async Event Request
672  *
673  * Stores an AER to be returned later if the Controller needs to notify the
674  * host of an event.
675  * Note that while the NVMe spec doesn't require Controllers to return AER's
676  * in order, this implementation does preserve the order.
677  */
678 static int
679 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
680 {
681         struct pci_nvme_aer *aer = NULL;
682
683         if (pci_nvme_aer_limit_reached(sc))
684                 return (-1);
685
686         aer = calloc(1, sizeof(struct pci_nvme_aer));
687         if (aer == NULL)
688                 return (-1);
689
690         sc->aer_count++;
691
692         /* Save the Command ID for use in the completion message */
693         aer->cid = cid;
694         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
695
696         return (0);
697 }
698
699 /*
700  * Get an Async Event Request structure
701  *
702  * Returns a pointer to an AER previously submitted by the host or NULL if
703  * no AER's exist. Caller is responsible for freeing the returned struct.
704  */
705 static struct pci_nvme_aer *
706 pci_nvme_aer_get(struct pci_nvme_softc *sc)
707 {
708         struct pci_nvme_aer *aer = NULL;
709
710         aer = STAILQ_FIRST(&sc->aer_list);
711         if (aer != NULL) {
712                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
713                 sc->aer_count--;
714         }
715         
716         return (aer);
717 }
718
719 static void
720 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
721 {
722         uint32_t i;
723
724         DPRINTF("%s", __func__);
725
726         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
727             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
728             (60 << NVME_CAP_LO_REG_TO_SHIFT);
729
730         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
731
732         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
733
734         sc->regs.cc = 0;
735         sc->regs.csts = 0;
736
737         assert(sc->submit_queues != NULL);
738
739         for (i = 0; i < sc->num_squeues + 1; i++) {
740                 sc->submit_queues[i].qbase = NULL;
741                 sc->submit_queues[i].size = 0;
742                 sc->submit_queues[i].cqid = 0;
743                 sc->submit_queues[i].tail = 0;
744                 sc->submit_queues[i].head = 0;
745         }
746
747         assert(sc->compl_queues != NULL);
748
749         for (i = 0; i < sc->num_cqueues + 1; i++) {
750                 sc->compl_queues[i].qbase = NULL;
751                 sc->compl_queues[i].size = 0;
752                 sc->compl_queues[i].tail = 0;
753                 sc->compl_queues[i].head = 0;
754         }
755
756         sc->num_q_is_set = false;
757
758         pci_nvme_aer_destroy(sc);
759 }
760
761 static void
762 pci_nvme_reset(struct pci_nvme_softc *sc)
763 {
764         pthread_mutex_lock(&sc->mtx);
765         pci_nvme_reset_locked(sc);
766         pthread_mutex_unlock(&sc->mtx);
767 }
768
769 static void
770 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
771 {
772         uint16_t acqs, asqs;
773
774         DPRINTF("%s", __func__);
775
776         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
777         sc->submit_queues[0].size = asqs;
778         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
779                     sizeof(struct nvme_command) * asqs);
780
781         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
782                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
783
784         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
785             NVME_AQA_REG_ACQS_MASK) + 1;
786         sc->compl_queues[0].size = acqs;
787         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
788                  sizeof(struct nvme_completion) * acqs);
789         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
790
791         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
792                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
793 }
794
795 static int
796 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
797         size_t len, enum nvme_copy_dir dir)
798 {
799         uint8_t *p;
800         size_t bytes;
801
802         if (len > (8 * 1024)) {
803                 return (-1);
804         }
805
806         /* Copy from the start of prp1 to the end of the physical page */
807         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
808         bytes = MIN(bytes, len);
809
810         p = vm_map_gpa(ctx, prp1, bytes);
811         if (p == NULL) {
812                 return (-1);
813         }
814
815         if (dir == NVME_COPY_TO_PRP)
816                 memcpy(p, b, bytes);
817         else
818                 memcpy(b, p, bytes);
819
820         b += bytes;
821
822         len -= bytes;
823         if (len == 0) {
824                 return (0);
825         }
826
827         len = MIN(len, PAGE_SIZE);
828
829         p = vm_map_gpa(ctx, prp2, len);
830         if (p == NULL) {
831                 return (-1);
832         }
833
834         if (dir == NVME_COPY_TO_PRP)
835                 memcpy(p, b, len);
836         else
837                 memcpy(b, p, len);
838
839         return (0);
840 }
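/*
 * Editorial example (not in the original source): for a 6 KiB copy with
 * prp1 = 0x1000800, the first chunk covers PAGE_SIZE - 0x800 = 2 KiB up to
 * the end of that page and the remaining 4 KiB is copied via prp2. Anything
 * larger than 8 KiB would need a PRP list, which this helper rejects up
 * front.
 */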
841
842 /*
843  * Write a Completion Queue Entry update
844  *
845  * Write the completion and update the doorbell value
846  */
847 static void
848 pci_nvme_cq_update(struct pci_nvme_softc *sc,
849                 struct nvme_completion_queue *cq,
850                 uint32_t cdw0,
851                 uint16_t cid,
852                 uint16_t sqid,
853                 uint16_t status)
854 {
855         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
856         struct nvme_completion *cqe;
857
858         assert(cq->qbase != NULL);
859
860         pthread_mutex_lock(&cq->mtx);
861
862         cqe = &cq->qbase[cq->tail];
863
864         /* Flip the phase bit */
865         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
866
867         cqe->cdw0 = cdw0;
868         cqe->sqhd = sq->head;
869         cqe->sqid = sqid;
870         cqe->cid = cid;
871         cqe->status = status;
872
873         cq->tail++;
874         if (cq->tail >= cq->size) {
875                 cq->tail = 0;
876         }
877
878         pthread_mutex_unlock(&cq->mtx);
879 }
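/*
 * Editorial note on the Phase Tag handling above (not in the original
 * source): each new entry is written with the opposite phase of whatever the
 * slot previously held, e.g. a slot still holding P=0 from the prior pass is
 * written with P=1, which lets the guest tell fresh completions from stale
 * ones without an explicit valid flag.
 */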
880
881 static int
882 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
883         struct nvme_completion* compl)
884 {
885         uint16_t qid = command->cdw10 & 0xffff;
886
887         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
888         if (qid == 0 || qid > sc->num_squeues ||
889             (sc->submit_queues[qid].qbase == NULL)) {
890                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
891                         __func__, qid, sc->num_squeues);
892                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
893                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
894                 return (1);
895         }
896
897         sc->submit_queues[qid].qbase = NULL;
898         sc->submit_queues[qid].cqid = 0;
899         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
900         return (1);
901 }
902
903 static int
904 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
905         struct nvme_completion* compl)
906 {
907         if (command->cdw11 & NVME_CMD_CDW11_PC) {
908                 uint16_t qid = command->cdw10 & 0xffff;
909                 struct nvme_submission_queue *nsq;
910
911                 if ((qid == 0) || (qid > sc->num_squeues) ||
912                     (sc->submit_queues[qid].qbase != NULL)) {
913                         WPRINTF("%s queue index %u > num_squeues %u",
914                                 __func__, qid, sc->num_squeues);
915                         pci_nvme_status_tc(&compl->status,
916                             NVME_SCT_COMMAND_SPECIFIC,
917                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
918                         return (1);
919                 }
920
921                 nsq = &sc->submit_queues[qid];
922                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
923                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
924                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
925                         /*
926                          * Queues must specify at least two entries
927                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
928                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
929                          */
930                         pci_nvme_status_tc(&compl->status,
931                             NVME_SCT_COMMAND_SPECIFIC,
932                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
933                         return (1);
934                 }
935
936                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
937                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
938                         pci_nvme_status_tc(&compl->status,
939                             NVME_SCT_COMMAND_SPECIFIC,
940                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
941                         return (1);
942                 }
943
944                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
945                         pci_nvme_status_tc(&compl->status,
946                             NVME_SCT_COMMAND_SPECIFIC,
947                             NVME_SC_COMPLETION_QUEUE_INVALID);
948                         return (1);
949                 }
950
951                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
952
953                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
954                               sizeof(struct nvme_command) * (size_t)nsq->size);
955
956                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
957                         qid, nsq->size, nsq->qbase, nsq->cqid);
958
959                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
960
961                 DPRINTF("%s completed creating IOSQ qid %u",
962                          __func__, qid);
963         } else {
964                 /*
965                  * Guest requested a non-contiguous (PRP list based)
966                  * Submission Queue, which this emulation does not support.
967                  */
968                 WPRINTF("%s unsupported non-contig (list-based) "
969                          "create i/o submission queue", __func__);
970
971                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
972         }
973         return (1);
974 }
975
976 static int
977 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
978         struct nvme_completion* compl)
979 {
980         uint16_t qid = command->cdw10 & 0xffff;
981         uint16_t sqid;
982
983         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
984         if (qid == 0 || qid > sc->num_cqueues ||
985             (sc->compl_queues[qid].qbase == NULL)) {
986                 WPRINTF("%s queue index %u / num_cqueues %u",
987                         __func__, qid, sc->num_cqueues);
988                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
989                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
990                 return (1);
991         }
992
993         /* Deleting an Active CQ is an error */
994         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
995                 if (sc->submit_queues[sqid].cqid == qid) {
996                         pci_nvme_status_tc(&compl->status,
997                             NVME_SCT_COMMAND_SPECIFIC,
998                             NVME_SC_INVALID_QUEUE_DELETION);
999                         return (1);
1000                 }
1001
1002         sc->compl_queues[qid].qbase = NULL;
1003         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1004         return (1);
1005 }
1006
1007 static int
1008 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1009         struct nvme_completion* compl)
1010 {
1011         struct nvme_completion_queue *ncq;
1012         uint16_t qid = command->cdw10 & 0xffff;
1013
1014         /* Only support Physically Contiguous queues */
1015         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1016                 WPRINTF("%s unsupported non-contig (list-based) "
1017                          "create i/o completion queue",
1018                          __func__);
1019
1020                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1021                 return (1);
1022         }
1023
1024         if ((qid == 0) || (qid > sc->num_cqueues) ||
1025             (sc->compl_queues[qid].qbase != NULL)) {
1026                 WPRINTF("%s queue index %u > num_cqueues %u",
1027                         __func__, qid, sc->num_cqueues);
1028                 pci_nvme_status_tc(&compl->status,
1029                     NVME_SCT_COMMAND_SPECIFIC,
1030                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1031                 return (1);
1032         }
1033
1034         ncq = &sc->compl_queues[qid];
1035         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1036         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1037         if (ncq->intr_vec > (sc->max_queues + 1)) {
1038                 pci_nvme_status_tc(&compl->status,
1039                     NVME_SCT_COMMAND_SPECIFIC,
1040                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1041                 return (1);
1042         }
1043
1044         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1045         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1046                 /*
1047                  * Queues must specify at least two entries
1048                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1049                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1050                  */
1051                 pci_nvme_status_tc(&compl->status,
1052                     NVME_SCT_COMMAND_SPECIFIC,
1053                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1054                 return (1);
1055         }
1056         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1057                      command->prp1,
1058                      sizeof(struct nvme_completion) * (size_t)ncq->size);
1059
1060         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1061
1062
1063         return (1);
1064 }
1065
1066 static int
1067 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1068         struct nvme_completion* compl)
1069 {
1070         uint32_t logsize;
1071         uint8_t logpage = command->cdw10 & 0xFF;
1072
1073         /*
1074          * Command specifies the number of dwords to return in fields NUMDU
1075          * and NUMDL. This is a zero-based value.
1076          */
1077         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1078         logsize *= sizeof(uint32_t);
1079
1080         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1081
1082         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1083
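        /*
         * Editorial example (not in the original source): NUMDU/NUMDL of 0
         * requests (0 + 1) * 4 = 4 bytes, while the 512-byte SMART / Health
         * page corresponds to a zero-based dword count of 127.
         */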
1084         switch (logpage) {
1085         case NVME_LOG_ERROR:
1086                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1087                     command->prp2, (uint8_t *)&sc->err_log,
1088                     MIN(logsize, sizeof(sc->err_log)),
1089                     NVME_COPY_TO_PRP);
1090                 break;
1091         case NVME_LOG_HEALTH_INFORMATION:
1092                 pthread_mutex_lock(&sc->mtx);
1093                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1094                     sizeof(sc->health_log.data_units_read));
1095                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1096                     sizeof(sc->health_log.data_units_written));
1097                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1098                     sizeof(sc->health_log.host_read_commands));
1099                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1100                     sizeof(sc->health_log.host_write_commands));
1101                 pthread_mutex_unlock(&sc->mtx);
1102
1103                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1104                     command->prp2, (uint8_t *)&sc->health_log,
1105                     MIN(logsize, sizeof(sc->health_log)),
1106                     NVME_COPY_TO_PRP);
1107                 break;
1108         case NVME_LOG_FIRMWARE_SLOT:
1109                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1110                     command->prp2, (uint8_t *)&sc->fw_log,
1111                     MIN(logsize, sizeof(sc->fw_log)),
1112                     NVME_COPY_TO_PRP);
1113                 break;
1114         default:
1115                 DPRINTF("%s get log page %x command not supported",
1116                         __func__, logpage);
1117
1118                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1119                     NVME_SC_INVALID_LOG_PAGE);
1120         }
1121
1122         return (1);
1123 }
1124
1125 static int
1126 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1127         struct nvme_completion* compl)
1128 {
1129         void *dest;
1130         uint16_t status;
1131
1132         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1133                 command->cdw10 & 0xFF, command->nsid);
1134
1135         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1136
1137         switch (command->cdw10 & 0xFF) {
1138         case 0x00: /* return Identify Namespace data structure */
1139                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1140                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1141                     NVME_COPY_TO_PRP);
1142                 break;
1143         case 0x01: /* return Identify Controller data structure */
1144                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1145                     command->prp2, (uint8_t *)&sc->ctrldata,
1146                     sizeof(sc->ctrldata),
1147                     NVME_COPY_TO_PRP);
1148                 break;
1149         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1150                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1151                                   sizeof(uint32_t) * 1024);
1152                 /* All unused entries shall be zero */
1153                 bzero(dest, sizeof(uint32_t) * 1024);
1154                 ((uint32_t *)dest)[0] = 1;
1155                 break;
1156         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1157                 if (command->nsid != 1) {
1158                         pci_nvme_status_genc(&status,
1159                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1160                         break;
1161                 }
1162                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1163                                   sizeof(uint32_t) * 1024);
1164                 /* All bytes after the descriptor shall be zero */
1165                 bzero(dest, sizeof(uint32_t) * 1024);
1166
1167                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1168                 ((uint8_t *)dest)[0] = 1;
1169                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1170                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1171                 break;
1172         default:
1173                 DPRINTF("%s unsupported identify command requested 0x%x",
1174                          __func__, command->cdw10 & 0xFF);
1175                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1176                 break;
1177         }
1178
1179         compl->status = status;
1180         return (1);
1181 }
1182
1183 static const char *
1184 nvme_fid_to_name(uint8_t fid)
1185 {
1186         const char *name;
1187
1188         switch (fid) {
1189         case NVME_FEAT_ARBITRATION:
1190                 name = "Arbitration";
1191                 break;
1192         case NVME_FEAT_POWER_MANAGEMENT:
1193                 name = "Power Management";
1194                 break;
1195         case NVME_FEAT_LBA_RANGE_TYPE:
1196                 name = "LBA Range Type";
1197                 break;
1198         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1199                 name = "Temperature Threshold";
1200                 break;
1201         case NVME_FEAT_ERROR_RECOVERY:
1202                 name = "Error Recovery";
1203                 break;
1204         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1205                 name = "Volatile Write Cache";
1206                 break;
1207         case NVME_FEAT_NUMBER_OF_QUEUES:
1208                 name = "Number of Queues";
1209                 break;
1210         case NVME_FEAT_INTERRUPT_COALESCING:
1211                 name = "Interrupt Coalescing";
1212                 break;
1213         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1214                 name = "Interrupt Vector Configuration";
1215                 break;
1216         case NVME_FEAT_WRITE_ATOMICITY:
1217                 name = "Write Atomicity Normal";
1218                 break;
1219         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1220                 name = "Asynchronous Event Configuration";
1221                 break;
1222         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1223                 name = "Autonomous Power State Transition";
1224                 break;
1225         case NVME_FEAT_HOST_MEMORY_BUFFER:
1226                 name = "Host Memory Buffer";
1227                 break;
1228         case NVME_FEAT_TIMESTAMP:
1229                 name = "Timestamp";
1230                 break;
1231         case NVME_FEAT_KEEP_ALIVE_TIMER:
1232                 name = "Keep Alive Timer";
1233                 break;
1234         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1235                 name = "Host Controlled Thermal Management";
1236                 break;
1237         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1238                 name = "Non-Operational Power State Config";
1239                 break;
1240         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1241                 name = "Read Recovery Level Config";
1242                 break;
1243         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1244                 name = "Predictable Latency Mode Config";
1245                 break;
1246         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1247                 name = "Predictable Latency Mode Window";
1248                 break;
1249         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1250                 name = "LBA Status Information Report Interval";
1251                 break;
1252         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1253                 name = "Host Behavior Support";
1254                 break;
1255         case NVME_FEAT_SANITIZE_CONFIG:
1256                 name = "Sanitize Config";
1257                 break;
1258         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1259                 name = "Endurance Group Event Configuration";
1260                 break;
1261         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1262                 name = "Software Progress Marker";
1263                 break;
1264         case NVME_FEAT_HOST_IDENTIFIER:
1265                 name = "Host Identifier";
1266                 break;
1267         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1268                 name = "Reservation Notification Mask";
1269                 break;
1270         case NVME_FEAT_RESERVATION_PERSISTENCE:
1271                 name = "Reservation Persistence";
1272                 break;
1273         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1274                 name = "Namespace Write Protection Config";
1275                 break;
1276         default:
1277                 name = "Unknown";
1278                 break;
1279         }
1280
1281         return (name);
1282 }
1283
1284 static void
1285 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1286     struct nvme_feature_obj *feat,
1287     struct nvme_command *command,
1288     struct nvme_completion *compl)
1289 {
1290
1291         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1292 }
1293
1294 static void
1295 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1296     struct nvme_feature_obj *feat,
1297     struct nvme_command *command,
1298     struct nvme_completion *compl)
1299 {
1300         uint32_t i;
1301         uint32_t cdw11 = command->cdw11;
1302         uint16_t iv;
1303         bool cd;
1304
1305         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1306
1307         iv = cdw11 & 0xffff;
1308         cd = cdw11 & (1 << 16);
1309
1310         if (iv > (sc->max_queues + 1)) {
1311                 return;
1312         }
1313
1314         /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1315         if ((iv == 0) && !cd)
1316                 return;
1317
1318         /* Requested Interrupt Vector must be used by a CQ */
1319         for (i = 0; i < sc->num_cqueues + 1; i++) {
1320                 if (sc->compl_queues[i].intr_vec == iv) {
1321                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1322                 }
1323         }
1324
1325 }
1326
1327 static void
1328 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1329     struct nvme_feature_obj *feat,
1330     struct nvme_command *command,
1331     struct nvme_completion *compl)
1332 {
1333         uint16_t nqr;   /* Number of Queues Requested */
1334
1335         if (sc->num_q_is_set) {
1336                 WPRINTF("%s: Number of Queues already set", __func__);
1337                 pci_nvme_status_genc(&compl->status,
1338                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1339                 return;
1340         }
1341
1342         nqr = command->cdw11 & 0xFFFF;
1343         if (nqr == 0xffff) {
1344                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1345                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1346                 return;
1347         }
1348
1349         sc->num_squeues = ONE_BASED(nqr);
1350         if (sc->num_squeues > sc->max_queues) {
1351                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1352                                         sc->max_queues);
1353                 sc->num_squeues = sc->max_queues;
1354         }
1355
1356         nqr = (command->cdw11 >> 16) & 0xFFFF;
1357         if (nqr == 0xffff) {
1358                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1359                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1360                 return;
1361         }
1362
1363         sc->num_cqueues = ONE_BASED(nqr);
1364         if (sc->num_cqueues > sc->max_queues) {
1365                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1366                                         sc->max_queues);
1367                 sc->num_cqueues = sc->max_queues;
1368         }
1369
1370         /* Patch the command value which will be saved on callback's return */
1371         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1372         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1373
1374         sc->num_q_is_set = true;
1375 }
1376
1377 static int
1378 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1379         struct nvme_completion *compl)
1380 {
1381         struct nvme_feature_obj *feat;
1382         uint32_t nsid = command->nsid;
1383         uint8_t fid = command->cdw10 & 0xFF;
1384
1385         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1386
1387         if (fid >= NVME_FID_MAX) {
1388                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1389                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1390                 return (1);
1391         }
1392         feat = &sc->feat[fid];
1393
1394         if (!feat->namespace_specific &&
1395             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1396                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1397                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1398                 return (1);
1399         }
1400
1401         compl->cdw0 = 0;
1402         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1403
1404         if (feat->set)
1405                 feat->set(sc, feat, command, compl);
1406
1407         if (compl->status == NVME_SC_SUCCESS)
1408                 feat->cdw11 = command->cdw11;
1409
1410         return (0);
1411 }
1412
1413 static int
1414 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1415         struct nvme_completion* compl)
1416 {
1417         struct nvme_feature_obj *feat;
1418         uint8_t fid = command->cdw10 & 0xFF;
1419
1420         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1421
1422         if (fid >= NVME_FID_MAX) {
1423                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1424                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1425                 return (1);
1426         }
1427
1428         compl->cdw0 = 0;
1429         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1430
1431         feat = &sc->feat[fid];
1432         if (feat->get) {
1433                 feat->get(sc, feat, command, compl);
1434         }
1435
1436         if (compl->status == NVME_SC_SUCCESS) {
1437                 compl->cdw0 = feat->cdw11;
1438         }
1439
1440         return (0);
1441 }
1442
1443 static int
1444 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1445         struct nvme_completion* compl)
1446 {
1447         uint8_t ses, lbaf, pi;
1448
1449         /* Only supports Secure Erase Setting - User Data Erase */
1450         ses = (command->cdw10 >> 9) & 0x7;
1451         if (ses > 0x1) {
1452                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1453                 return (1);
1454         }
1455
1456         /* Only supports a single LBA Format */
1457         lbaf = command->cdw10 & 0xf;
1458         if (lbaf != 0) {
1459                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1460                     NVME_SC_INVALID_FORMAT);
1461                 return (1);
1462         }
1463
1464         /* Doesn't support Protection Information */
1465         pi = (command->cdw10 >> 5) & 0x7;
1466         if (pi != 0) {
1467                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1468                 return (1);
1469         }
1470
1471         if (sc->nvstore.type == NVME_STOR_RAM) {
1472                 if (sc->nvstore.ctx)
1473                         free(sc->nvstore.ctx);
1474                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1475                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1476         } else {
1477                 struct pci_nvme_ioreq *req;
1478                 int err;
1479
1480                 req = pci_nvme_get_ioreq(sc);
1481                 if (req == NULL) {
1482                         pci_nvme_status_genc(&compl->status,
1483                             NVME_SC_INTERNAL_DEVICE_ERROR);
1484                         WPRINTF("%s: unable to allocate IO req", __func__);
1485                         return (1);
1486                 }
1487                 req->nvme_sq = &sc->submit_queues[0];
1488                 req->sqid = 0;
1489                 req->opc = command->opc;
1490                 req->cid = command->cid;
1491                 req->nsid = command->nsid;
1492
1493                 req->io_req.br_offset = 0;
1494                 req->io_req.br_resid = sc->nvstore.size;
1495                 req->io_req.br_callback = pci_nvme_io_done;
1496
1497                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1498                 if (err) {
1499                         pci_nvme_status_genc(&compl->status,
1500                             NVME_SC_INTERNAL_DEVICE_ERROR);
1501                         pci_nvme_release_ioreq(sc, req);
1502                 }
1503         }
1504
1505         return (1);
1506 }
1507
1508 static int
1509 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1510         struct nvme_completion* compl)
1511 {
1512         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1513                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1514
1515         /* TODO: search for the command ID and abort it */
1516
1517         compl->cdw0 = 1;
1518         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1519         return (1);
1520 }
1521
1522 static int
1523 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1524         struct nvme_command* command, struct nvme_completion* compl)
1525 {
1526         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1527
1528         /* Don't exceed the Async Event Request Limit (AERL). */
1529         if (pci_nvme_aer_limit_reached(sc)) {
1530                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1531                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1532                 return (1);
1533         }
1534
1535         if (pci_nvme_aer_add(sc, command->cid)) {
1536                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1537                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1538                 return (1);
1539         }
1540
1541         /*
1542          * Events are raised asynchronously, as enabled via Set Features.
1543          * Do not post a completion now; this request only completes when a
1544          * matching event occurs.
1545          */
1546         compl->status = NVME_NO_STATUS;
1547
1548         return (0);
1549 }
1550
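     /*
      * Process Admin Submission Queue entries from the current head up to the
      * doorbell tail: dispatch each opcode, post any valid completion to Admin
      * CQ 0, and raise an MSI-X on vector 0 if the completion queue is
      * non-empty.
      */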
1551 static void
1552 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1553 {
1554         struct nvme_completion compl;
1555         struct nvme_command *cmd;
1556         struct nvme_submission_queue *sq;
1557         struct nvme_completion_queue *cq;
1558         uint16_t sqhead;
1559
1560         DPRINTF("%s index %u", __func__, (uint32_t)value);
1561
1562         sq = &sc->submit_queues[0];
1563         cq = &sc->compl_queues[0];
1564
1565         pthread_mutex_lock(&sq->mtx);
1566
1567         sqhead = sq->head;
1568         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1569
1570         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1571                 cmd = &(sq->qbase)[sqhead];
1572                 compl.status = 0;
1573
1574                 switch (cmd->opc) {
1575                 case NVME_OPC_DELETE_IO_SQ:
1576                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1577                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1578                         break;
1579                 case NVME_OPC_CREATE_IO_SQ:
1580                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1581                         nvme_opc_create_io_sq(sc, cmd, &compl);
1582                         break;
1583                 case NVME_OPC_DELETE_IO_CQ:
1584                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1585                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1586                         break;
1587                 case NVME_OPC_CREATE_IO_CQ:
1588                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1589                         nvme_opc_create_io_cq(sc, cmd, &compl);
1590                         break;
1591                 case NVME_OPC_GET_LOG_PAGE:
1592                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1593                         nvme_opc_get_log_page(sc, cmd, &compl);
1594                         break;
1595                 case NVME_OPC_IDENTIFY:
1596                         DPRINTF("%s command IDENTIFY", __func__);
1597                         nvme_opc_identify(sc, cmd, &compl);
1598                         break;
1599                 case NVME_OPC_ABORT:
1600                         DPRINTF("%s command ABORT", __func__);
1601                         nvme_opc_abort(sc, cmd, &compl);
1602                         break;
1603                 case NVME_OPC_SET_FEATURES:
1604                         DPRINTF("%s command SET_FEATURES", __func__);
1605                         nvme_opc_set_features(sc, cmd, &compl);
1606                         break;
1607                 case NVME_OPC_GET_FEATURES:
1608                         DPRINTF("%s command GET_FEATURES", __func__);
1609                         nvme_opc_get_features(sc, cmd, &compl);
1610                         break;
1611                 case NVME_OPC_FIRMWARE_ACTIVATE:
1612                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1613                         pci_nvme_status_tc(&compl.status,
1614                             NVME_SCT_COMMAND_SPECIFIC,
1615                             NVME_SC_INVALID_FIRMWARE_SLOT);
1616                         break;
1617                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1618                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1619                         nvme_opc_async_event_req(sc, cmd, &compl);
1620                         break;
1621                 case NVME_OPC_FORMAT_NVM:
1622                         DPRINTF("%s command FORMAT_NVM", __func__);
1623                         if ((sc->ctrldata.oacs &
1624                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1625                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                                     break;
1626                         }
1627                         compl.status = NVME_NO_STATUS;
1628                         nvme_opc_format_nvm(sc, cmd, &compl);
1629                         break;
1630                 default:
1631                         DPRINTF("0x%x command is not implemented",
1632                             cmd->opc);
1633                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1634                 }
1635                 sqhead = (sqhead + 1) % sq->size;
1636
1637                 if (NVME_COMPLETION_VALID(compl)) {
1638                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1639                             compl.cdw0,
1640                             cmd->cid,
1641                             0,          /* SQID */
1642                             compl.status);
1643                 }
1644         }
1645
1646         DPRINTF("setting sqhead %u", sqhead);
1647         sq->head = sqhead;
1648
1649         if (cq->head != cq->tail)
1650                 pci_generate_msix(sc->nsc_pi, 0);
1651
1652         pthread_mutex_unlock(&sq->mtx);
1653 }
1654
1655 /*
1656  * Update the Write and Read statistics reported in SMART data
1657  *
1658  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1659  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1660  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
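      * For example, with the remainder starting at 999, the first 512 byte
      * block written advances write_data_units to 1; it stays at 1 through
      * 1,000 total blocks and reaches 2 at block 1,001.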
1661  */
1662 static void
1663 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1664     size_t bytes, uint16_t status)
1665 {
1666
1667         pthread_mutex_lock(&sc->mtx);
1668         switch (opc) {
1669         case NVME_OPC_WRITE:
1670                 sc->write_commands++;
1671                 if (status != NVME_SC_SUCCESS)
1672                         break;
1673                 sc->write_dunits_remainder += (bytes / 512);
1674                 while (sc->write_dunits_remainder >= 1000) {
1675                         sc->write_data_units++;
1676                         sc->write_dunits_remainder -= 1000;
1677                 }
1678                 break;
1679         case NVME_OPC_READ:
1680                 sc->read_commands++;
1681                 if (status != NVME_SC_SUCCESS)
1682                         break;
1683                 sc->read_dunits_remainder += (bytes / 512);
1684                 while (sc->read_dunits_remainder >= 1000) {
1685                         sc->read_data_units++;
1686                         sc->read_dunits_remainder -= 1000;
1687                 }
1688                 break;
1689         default:
1690                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1691                 break;
1692         }
1693         pthread_mutex_unlock(&sc->mtx);
1694 }
1695
1696 /*
1697  * Check if the combination of Starting LBA (slba) and Number of Logical
1698  * Blocks (nlb) exceeds the range of the underlying storage.
1699  *
1700  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1701  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1702  * overflow.
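      * For example, with 512 byte sectors (sectsz_bits == 9) any slba at or
      * above 2^55 would overflow the byte offset, so the shift check below
      * rejects it before the offset is computed.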
1703  */
1704 static bool
1705 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1706     uint32_t nlb)
1707 {
1708         size_t  offset, bytes;
1709
1710         /* Overflow check of multiplying Starting LBA by the sector size */
1711         if (slba >> (64 - nvstore->sectsz_bits))
1712                 return (true);
1713
1714         offset = slba << nvstore->sectsz_bits;
1715         bytes = nlb << nvstore->sectsz_bits;
1716
1717         /* Overflow check of Number of Logical Blocks */
1718         if ((nvstore->size - offset) < bytes)
1719                 return (true);
1720
1721         return (false);
1722 }
1723
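     /*
      * Add a guest physical region to the request's blockif iovec. The first
      * entry records the starting block offset; subsequent entries that are
      * physically contiguous with the previous one are merged to keep the
      * iovec short. Returns -1 if the request is missing or the iovec is full.
      */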
1724 static int
1725 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1726         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1727 {
1728         int iovidx;
1729
1730         if (req == NULL)
1731                 return (-1);
1732
1733         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1734                 return (-1);
1735         }
1736
1737         /* concatenate contig block-iovs to minimize number of iovs */
1738         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1739                 iovidx = req->io_req.br_iovcnt - 1;
1740
1741                 req->io_req.br_iov[iovidx].iov_base =
1742                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1743                                      req->prev_gpaddr, size);
1744
1745                 req->prev_size += size;
1746                 req->io_req.br_resid += size;
1747
1748                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1749         } else {
1750                 iovidx = req->io_req.br_iovcnt;
1751                 if (iovidx == 0) {
1752                         req->io_req.br_offset = lba;
1753                         req->io_req.br_resid = 0;
1754                         req->io_req.br_param = req;
1755                 }
1756
1757                 req->io_req.br_iov[iovidx].iov_base =
1758                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1759                                      gpaddr, size);
1760
1761                 req->io_req.br_iov[iovidx].iov_len = size;
1762
1763                 req->prev_gpaddr = gpaddr;
1764                 req->prev_size = size;
1765                 req->io_req.br_resid += size;
1766
1767                 req->io_req.br_iovcnt++;
1768         }
1769
1770         return (0);
1771 }
1772
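     /*
      * Post a completion entry to the completion queue associated with the
      * given submission queue and, if that queue has interrupts enabled,
      * assert its MSI-X vector.
      */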
1773 static void
1774 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1775         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1776         uint32_t cdw0, uint16_t status)
1777 {
1778         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1779
1780         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1781                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1782                  NVME_STATUS_GET_SC(status));
1783
1784         pci_nvme_cq_update(sc, cq,
1785             cdw0,
1786             cid,
1787             sqid,
1788             status);
1789
1790         if (cq->head != cq->tail) {
1791                 if (cq->intr_en & NVME_CQ_INTEN) {
1792                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1793                 } else {
1794                         DPRINTF("%s: CQ%u interrupt disabled",
1795                                                 __func__, sq->cqid);
1796                 }
1797         }
1798 }
1799
1800 static void
1801 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1802 {
1803         req->sc = NULL;
1804         req->nvme_sq = NULL;
1805         req->sqid = 0;
1806
1807         pthread_mutex_lock(&sc->mtx);
1808
1809         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1810         sc->pending_ios--;
1811
1812         /* when no more IO pending, can set to ready if device reset/enabled */
1813         if (sc->pending_ios == 0 &&
1814             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1815                 sc->regs.csts |= NVME_CSTS_RDY;
1816
1817         pthread_mutex_unlock(&sc->mtx);
1818
1819         sem_post(&sc->iosemlock);
1820 }
1821
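     /*
      * Allocate an I/O request slot, blocking on the I/O semaphore until one
      * is free, and reset its embedded blockif request.
      */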
1822 static struct pci_nvme_ioreq *
1823 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1824 {
1825         struct pci_nvme_ioreq *req = NULL;
1826
1827         sem_wait(&sc->iosemlock);
1828         pthread_mutex_lock(&sc->mtx);
1829
1830         req = STAILQ_FIRST(&sc->ioreqs_free);
1831         assert(req != NULL);
1832         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1833
1834         req->sc = sc;
1835
1836         sc->pending_ios++;
1837
1838         pthread_mutex_unlock(&sc->mtx);
1839
1840         req->io_req.br_iovcnt = 0;
1841         req->io_req.br_offset = 0;
1842         req->io_req.br_resid = 0;
1843         req->io_req.br_param = req;
1844         req->prev_gpaddr = 0;
1845         req->prev_size = 0;
1846
1847         return req;
1848 }
1849
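     /*
      * blockif completion callback: translate the error into an NVMe status,
      * post the completion, update the SMART read/write counters, and return
      * the request to the free list.
      */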
1850 static void
1851 pci_nvme_io_done(struct blockif_req *br, int err)
1852 {
1853         struct pci_nvme_ioreq *req = br->br_param;
1854         struct nvme_submission_queue *sq = req->nvme_sq;
1855         uint16_t code, status;
1856
1857         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1858
1859         /* TODO return correct error */
1860         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1861         pci_nvme_status_genc(&status, code);
1862
1863         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1864         pci_nvme_stats_write_read_update(req->sc, req->opc,
1865             req->bytes, status);
1866         pci_nvme_release_ioreq(req->sc, req);
1867 }
1868
1869 /*
1870  * Implements the Flush command. The specification states:
1871  *    If a volatile write cache is not present, Flush commands complete
1872  *    successfully and have no effect
1873  * in the description of the Volatile Write Cache (VWC) field of the Identify
1874  * Controller data. Therefore, set status to Success if the command is
1875  * not supported (i.e. RAM or as indicated by the blockif).
1876  */
1877 static bool
1878 nvme_opc_flush(struct pci_nvme_softc *sc,
1879     struct nvme_command *cmd,
1880     struct pci_nvme_blockstore *nvstore,
1881     struct pci_nvme_ioreq *req,
1882     uint16_t *status)
1883 {
1884         bool pending = false;
1885
1886         if (nvstore->type == NVME_STOR_RAM) {
1887                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1888         } else {
1889                 int err;
1890
1891                 req->io_req.br_callback = pci_nvme_io_done;
1892
1893                 err = blockif_flush(nvstore->ctx, &req->io_req);
1894                 switch (err) {
1895                 case 0:
1896                         pending = true;
1897                         break;
1898                 case EOPNOTSUPP:
1899                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1900                         break;
1901                 default:
1902                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1903                 }
1904         }
1905
1906         return (pending);
1907 }
1908
1909 static uint16_t
1910 nvme_write_read_ram(struct pci_nvme_softc *sc,
1911     struct pci_nvme_blockstore *nvstore,
1912     uint64_t prp1, uint64_t prp2,
1913     size_t offset, uint64_t bytes,
1914     bool is_write)
1915 {
1916         uint8_t *buf = nvstore->ctx;
1917         enum nvme_copy_dir dir;
1918         uint16_t status;
1919
1920         if (is_write)
1921                 dir = NVME_COPY_TO_PRP;
1922         else
1923                 dir = NVME_COPY_FROM_PRP;
1924
1925         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1926             buf + offset, bytes, dir))
1927                 pci_nvme_status_genc(&status,
1928                     NVME_SC_DATA_TRANSFER_ERROR);
1929         else
1930                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1931
1932         return (status);
1933 }
1934
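     /*
      * Build the blockif transfer from the command's PRP entries: PRP1 maps
      * the data up to the first page boundary; if the remainder fits in one
      * more page, PRP2 is a second data pointer, otherwise PRP2 points to a
      * PRP list whose last entry chains to the next list page. The transfer
      * completes asynchronously via pci_nvme_io_done.
      */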
1935 static uint16_t
1936 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1937     struct pci_nvme_blockstore *nvstore,
1938     struct pci_nvme_ioreq *req,
1939     uint64_t prp1, uint64_t prp2,
1940     size_t offset, uint64_t bytes,
1941     bool is_write)
1942 {
1943         uint64_t size;
1944         int err;
1945         uint16_t status = NVME_NO_STATUS;
1946
1947         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1948         if (pci_nvme_append_iov_req(sc, req, prp1,
1949             size, is_write, offset)) {
1950                 pci_nvme_status_genc(&status,
1951                     NVME_SC_DATA_TRANSFER_ERROR);
1952                 goto out;
1953         }
1954
1955         offset += size;
1956         bytes  -= size;
1957
1958         if (bytes == 0) {
1959                 ;
1960         } else if (bytes <= PAGE_SIZE) {
1961                 size = bytes;
1962                 if (pci_nvme_append_iov_req(sc, req, prp2,
1963                     size, is_write, offset)) {
1964                         pci_nvme_status_genc(&status,
1965                             NVME_SC_DATA_TRANSFER_ERROR);
1966                         goto out;
1967                 }
1968         } else {
1969                 void *vmctx = sc->nsc_pi->pi_vmctx;
1970                 uint64_t *prp_list = &prp2;
1971                 uint64_t *last = prp_list;
1972
1973                 /* PRP2 is pointer to a physical region page list */
1974                 while (bytes) {
1975                         /* Last entry in list points to the next list */
1976                         if ((prp_list == last) && (bytes > PAGE_SIZE)) {
1977                                 uint64_t prp = *prp_list;
1978
1979                                 prp_list = paddr_guest2host(vmctx, prp,
1980                                     PAGE_SIZE - (prp % PAGE_SIZE));
1981                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
1982                         }
1983
1984                         size = MIN(bytes, PAGE_SIZE);
1985
1986                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
1987                             size, is_write, offset)) {
1988                                 pci_nvme_status_genc(&status,
1989                                     NVME_SC_DATA_TRANSFER_ERROR);
1990                                 goto out;
1991                         }
1992
1993                         offset += size;
1994                         bytes  -= size;
1995
1996                         prp_list++;
1997                 }
1998         }
1999         req->io_req.br_callback = pci_nvme_io_done;
2000         if (is_write)
2001                 err = blockif_write(nvstore->ctx, &req->io_req);
2002         else
2003                 err = blockif_read(nvstore->ctx, &req->io_req);
2004
2005         if (err)
2006                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2007 out:
2008         return (status);
2009 }
2010
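     /*
      * Handle NVM Read and Write commands: the starting LBA comes from
      * CDW10/CDW11 and the zero-based block count from CDW12. Transfers
      * outside the namespace or larger than the advertised MDTS are rejected;
      * otherwise the request is handed to the RAM or blockif backend.
      */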
2011 static bool
2012 nvme_opc_write_read(struct pci_nvme_softc *sc,
2013     struct nvme_command *cmd,
2014     struct pci_nvme_blockstore *nvstore,
2015     struct pci_nvme_ioreq *req,
2016     uint16_t *status)
2017 {
2018         uint64_t lba, nblocks, bytes;
2019         size_t offset;
2020         bool is_write = cmd->opc == NVME_OPC_WRITE;
2021         bool pending = false;
2022
2023         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2024         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2025         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2026                 WPRINTF("%s command would exceed LBA range", __func__);
2027                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2028                 goto out;
2029         }
2030
2031         bytes  = nblocks << nvstore->sectsz_bits;
2032         if (bytes > NVME_MAX_DATA_SIZE) {
2033                 WPRINTF("%s command would exceed MDTS", __func__);
2034                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2035                 goto out;
2036         }
2037
2038         offset = lba << nvstore->sectsz_bits;
2039
2040         req->bytes = bytes;
2041         req->io_req.br_offset = lba;
2042
2043         /* PRP bits 1:0 must be zero */
2044         cmd->prp1 &= ~0x3UL;
2045         cmd->prp2 &= ~0x3UL;
2046
2047         if (nvstore->type == NVME_STOR_RAM) {
2048                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2049                     cmd->prp2, offset, bytes, is_write);
2050         } else {
2051                 *status = nvme_write_read_blockif(sc, nvstore, req,
2052                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2053
2054                 if (*status == NVME_NO_STATUS)
2055                         pending = true;
2056         }
2057 out:
2058         if (!pending)
2059                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2060
2061         return (pending);
2062 }
2063
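     /*
      * Deallocate state machine: blockif callback which steps through the
      * ranges stashed in br_iov, issuing one blockif_delete per range.
      * prev_gpaddr holds the index of the current range and prev_size the
      * total number of ranges; the NVMe completion is posted once every range
      * has been processed or an error occurs.
      */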
2064 static void
2065 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2066 {
2067         struct pci_nvme_ioreq *req = br->br_param;
2068         struct pci_nvme_softc *sc = req->sc;
2069         bool done = true;
2070         uint16_t status;
2071
2072         if (err) {
2073                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2074         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2075                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2076         } else {
2077                 struct iovec *iov = req->io_req.br_iov;
2078
2079                 req->prev_gpaddr++;
2080                 iov += req->prev_gpaddr;
2081
2082                 /* The iov_* values already include the sector size */
2083                 req->io_req.br_offset = (off_t)iov->iov_base;
2084                 req->io_req.br_resid = iov->iov_len;
2085                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2086                         pci_nvme_status_genc(&status,
2087                             NVME_SC_INTERNAL_DEVICE_ERROR);
2088                 } else
2089                         done = false;
2090         }
2091
2092         if (done) {
2093                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2094                     req->cid, 0, status);
2095                 pci_nvme_release_ioreq(sc, req);
2096         }
2097 }
2098
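     /*
      * Handle Dataset Management: copy the range list out of guest memory via
      * the PRP entries, validate each range, and, when the Deallocate
      * attribute is set and the backend supports it, trim the described
      * blocks. A single range is issued directly; multiple ranges are queued
      * and processed by pci_nvme_dealloc_sm.
      */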
2099 static bool
2100 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2101     struct nvme_command *cmd,
2102     struct pci_nvme_blockstore *nvstore,
2103     struct pci_nvme_ioreq *req,
2104     uint16_t *status)
2105 {
2106         struct nvme_dsm_range *range;
2107         uint32_t nr, r, non_zero, dr;
2108         int err;
2109         bool pending = false;
2110
2111         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2112                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2113                 goto out;
2114         }
2115
2116         nr = cmd->cdw10 & 0xff;
2117
2118         /* copy locally because a range entry could straddle PRPs */
2119         range = calloc(1, NVME_MAX_DSM_TRIM);
2120         if (range == NULL) {
2121                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2122                 goto out;
2123         }
2124         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2125             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2126
2127         /* Check for invalid ranges and the number of non-zero lengths */
2128         non_zero = 0;
2129         for (r = 0; r <= nr; r++) {
2130                 if (pci_nvme_out_of_range(nvstore,
2131                     range[r].starting_lba, range[r].length)) {
2132                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2133                         goto out;
2134                 }
2135                 if (range[r].length != 0)
2136                         non_zero++;
2137         }
2138
2139         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2140                 size_t offset, bytes;
2141                 int sectsz_bits = sc->nvstore.sectsz_bits;
2142
2143                 /*
2144                  * DSM calls are advisory only, and compliant controllers
2145                  * may choose to take no actions (i.e. return Success).
2146                  */
2147                 if (!nvstore->deallocate) {
2148                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2149                         goto out;
2150                 }
2151
2152                 /* If all ranges have a zero length, return Success */
2153                 if (non_zero == 0) {
2154                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2155                         goto out;
2156                 }
2157
2158                 if (req == NULL) {
2159                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2160                         goto out;
2161                 }
2162
2163                 offset = range[0].starting_lba << sectsz_bits;
2164                 bytes = range[0].length << sectsz_bits;
2165
2166                 /*
2167                  * If the request is for more than a single range, store
2168                  * the ranges in the br_iov. Optimize for the common case
2169                  * of a single range.
2170                  *
2171                  * Note that NVMe Number of Ranges is a zero based value
2172                  */
2173                 req->io_req.br_iovcnt = 0;
2174                 req->io_req.br_offset = offset;
2175                 req->io_req.br_resid = bytes;
2176
2177                 if (nr == 0) {
2178                         req->io_req.br_callback = pci_nvme_io_done;
2179                 } else {
2180                         struct iovec *iov = req->io_req.br_iov;
2181
2182                         for (r = 0, dr = 0; r <= nr; r++) {
2183                                 offset = range[r].starting_lba << sectsz_bits;
2184                                 bytes = range[r].length << sectsz_bits;
2185                                 if (bytes == 0)
2186                                         continue;
2187
2188                                 if ((nvstore->size - offset) < bytes) {
2189                                         pci_nvme_status_genc(status,
2190                                             NVME_SC_LBA_OUT_OF_RANGE);
2191                                         goto out;
2192                                 }
2193                                 iov[dr].iov_base = (void *)offset;
2194                                 iov[dr].iov_len = bytes;
2195                                 dr++;
2196                         }
2197                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2198
2199                         /*
2200                          * Use prev_gpaddr to track the current entry and
2201                          * prev_size to track the number of entries
2202                          */
2203                         req->prev_gpaddr = 0;
2204                         req->prev_size = dr;
2205                 }
2206
2207                 err = blockif_delete(nvstore->ctx, &req->io_req);
2208                 if (err)
2209                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2210                 else
2211                         pending = true;
2212         }
2213 out:
2214         free(range);
2215         return (pending);
2216 }
2217
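     /*
      * Drain an I/O Submission Queue from the current head up to the doorbell
      * tail: validate the namespace, allocate an I/O request, and dispatch by
      * opcode. Commands that finish synchronously are completed (and their
      * request released) here; commands handed to blockif complete later via
      * their callback.
      */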
2218 static void
2219 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2220 {
2221         struct nvme_submission_queue *sq;
2222         uint16_t status;
2223         uint16_t sqhead;
2224
2225         /* handle all submissions up to sq->tail index */
2226         sq = &sc->submit_queues[idx];
2227
2228         pthread_mutex_lock(&sq->mtx);
2229
2230         sqhead = sq->head;
2231         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2232                  idx, sqhead, sq->tail, sq->qbase);
2233
2234         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2235                 struct nvme_command *cmd;
2236                 struct pci_nvme_ioreq *req;
2237                 uint32_t nsid;
2238                 bool pending;
2239
2240                 pending = false;
2241                 req = NULL;
2242                 status = 0;
2243
2244                 cmd = &sq->qbase[sqhead];
2245                 sqhead = (sqhead + 1) % sq->size;
2246
2247                 nsid = le32toh(cmd->nsid);
2248                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2249                         pci_nvme_status_genc(&status,
2250                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2251                         status |=
2252                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2253                         goto complete;
2254                 }
2255
2256                 req = pci_nvme_get_ioreq(sc);
2257                 if (req == NULL) {
2258                         pci_nvme_status_genc(&status,
2259                             NVME_SC_INTERNAL_DEVICE_ERROR);
2260                         WPRINTF("%s: unable to allocate IO req", __func__);
2261                         goto complete;
2262                 }
2263                 req->nvme_sq = sq;
2264                 req->sqid = idx;
2265                 req->opc = cmd->opc;
2266                 req->cid = cmd->cid;
2267                 req->nsid = cmd->nsid;
2268
2269                 switch (cmd->opc) {
2270                 case NVME_OPC_FLUSH:
2271                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2272                             req, &status);
2273                         break;
2274                 case NVME_OPC_WRITE:
2275                 case NVME_OPC_READ:
2276                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2277                             req, &status);
2278                         break;
2279                 case NVME_OPC_WRITE_ZEROES:
2280                         /* TODO: write zeroes
2281                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2282                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2283                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2284                         break;
2285                 case NVME_OPC_DATASET_MANAGEMENT:
2286                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2287                             req, &status);
2288                         break;
2289                 default:
2290                         WPRINTF("%s unhandled io command 0x%x",
2291                             __func__, cmd->opc);
2292                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2293                 }
2294 complete:
2295                 if (!pending) {
2296                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2297                             status);
2298                         if (req != NULL)
2299                                 pci_nvme_release_ioreq(sc, req);
2300                 }
2301         }
2302
2303         sq->head = sqhead;
2304
2305         pthread_mutex_unlock(&sq->mtx);
2306 }
2307
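     /*
      * Handle a guest doorbell write. Doorbells are packed as 4 byte
      * registers, submission queue tail then completion queue head for each
      * queue pair, which is how the caller derives idx and is_sq. SQ
      * doorbells record the new tail and kick the admin (queue 0) or I/O
      * handler; CQ doorbells record the new head.
      */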
2308 static void
2309 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2310         uint64_t idx, int is_sq, uint64_t value)
2311 {
2312         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2313                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2314
2315         if (is_sq) {
2316                 if (idx > sc->num_squeues) {
2317                         WPRINTF("%s queue index %lu overflow from "
2318                                  "guest (max %u)",
2319                                  __func__, idx, sc->num_squeues);
2320                         return;
2321                 }
2322
2323                 atomic_store_short(&sc->submit_queues[idx].tail,
2324                                    (uint16_t)value);
2325
2326                 if (idx == 0) {
2327                         pci_nvme_handle_admin_cmd(sc, value);
2328                 } else {
2329                         /* submission queue; handle new entries in SQ */
2330                         if (idx > sc->num_squeues) {
2331                                 WPRINTF("%s SQ index %lu overflow from "
2332                                          "guest (max %u)",
2333                                          __func__, idx, sc->num_squeues);
2334                                 return;
2335                         }
2336                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2337                 }
2338         } else {
2339                 if (idx > sc->num_cqueues) {
2340                         WPRINTF("%s queue index %lu overflow from "
2341                                  "guest (max %u)",
2342                                  __func__, idx, sc->num_cqueues);
2343                         return;
2344                 }
2345
2346                 atomic_store_short(&sc->compl_queues[idx].head,
2347                                 (uint16_t)value);
2348         }
2349 }
2350
2351 static void
2352 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2353 {
2354         const char *s = iswrite ? "WRITE" : "READ";
2355
2356         switch (offset) {
2357         case NVME_CR_CAP_LOW:
2358                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2359                 break;
2360         case NVME_CR_CAP_HI:
2361                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2362                 break;
2363         case NVME_CR_VS:
2364                 DPRINTF("%s %s NVME_CR_VS", func, s);
2365                 break;
2366         case NVME_CR_INTMS:
2367                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2368                 break;
2369         case NVME_CR_INTMC:
2370                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2371                 break;
2372         case NVME_CR_CC:
2373                 DPRINTF("%s %s NVME_CR_CC", func, s);
2374                 break;
2375         case NVME_CR_CSTS:
2376                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2377                 break;
2378         case NVME_CR_NSSR:
2379                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2380                 break;
2381         case NVME_CR_AQA:
2382                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2383                 break;
2384         case NVME_CR_ASQ_LOW:
2385                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2386                 break;
2387         case NVME_CR_ASQ_HI:
2388                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2389                 break;
2390         case NVME_CR_ACQ_LOW:
2391                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2392                 break;
2393         case NVME_CR_ACQ_HI:
2394                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2395                 break;
2396         default:
2397                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2398         }
2399
2400 }
2401
2402 static void
2403 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2404         uint64_t offset, int size, uint64_t value)
2405 {
2406         uint32_t ccreg;
2407
2408         if (offset >= NVME_DOORBELL_OFFSET) {
2409                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2410                 uint64_t idx = belloffset / 8; /* 8 bytes per queue pair */
2411                 int is_sq = (belloffset % 8) < 4; /* SQ tail, then CQ head */
2412
2413                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2414                         WPRINTF("guest attempted an overflow write offset "
2415                                  "0x%lx, val 0x%lx in %s",
2416                                  offset, value, __func__);
2417                         return;
2418                 }
2419
2420                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2421                 return;
2422         }
2423
2424         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2425                 offset, size, value);
2426
2427         if (size != 4) {
2428                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2429                          "val 0x%lx) to bar0 in %s",
2430                          size, offset, value, __func__);
2431                 /* TODO: shutdown device */
2432                 return;
2433         }
2434
2435         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2436
2437         pthread_mutex_lock(&sc->mtx);
2438
2439         switch (offset) {
2440         case NVME_CR_CAP_LOW:
2441         case NVME_CR_CAP_HI:
2442                 /* readonly */
2443                 break;
2444         case NVME_CR_VS:
2445                 /* readonly */
2446                 break;
2447         case NVME_CR_INTMS:
2448                 /* MSI-X, so ignore */
2449                 break;
2450         case NVME_CR_INTMC:
2451                 /* MSI-X, so ignore */
2452                 break;
2453         case NVME_CR_CC:
2454                 ccreg = (uint32_t)value;
2455
2456                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2457                          "iocqes %u",
2458                         __func__,
2459                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2460                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2461                          NVME_CC_GET_IOCQES(ccreg));
2462
2463                 if (NVME_CC_GET_SHN(ccreg)) {
2464                         /* perform shutdown - flush out data to backend */
2465                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2466                             NVME_CSTS_REG_SHST_SHIFT);
2467                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2468                             NVME_CSTS_REG_SHST_SHIFT;
2469                 }
2470                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2471                         if (NVME_CC_GET_EN(ccreg) == 0)
2472                                 /* transition 1->0 causes controller reset */
2473                                 pci_nvme_reset_locked(sc);
2474                         else
2475                                 pci_nvme_init_controller(ctx, sc);
2476                 }
2477
2478                 /* Insert the iocqes, iosqes and en bits from the write */
2479                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2480                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2481                 if (NVME_CC_GET_EN(ccreg) == 0) {
2482                         /* Insert the ams, mps and css bit fields */
2483                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2484                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2485                         sc->regs.csts &= ~NVME_CSTS_RDY;
2486                 } else if (sc->pending_ios == 0) {
2487                         sc->regs.csts |= NVME_CSTS_RDY;
2488                 }
2489                 break;
2490         case NVME_CR_CSTS:
2491                 break;
2492         case NVME_CR_NSSR:
2493                 /* ignore writes; don't support subsystem reset */
2494                 break;
2495         case NVME_CR_AQA:
2496                 sc->regs.aqa = (uint32_t)value;
2497                 break;
2498         case NVME_CR_ASQ_LOW:
2499                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2500                                (0xFFFFF000 & value);
2501                 break;
2502         case NVME_CR_ASQ_HI:
2503                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2504                                (value << 32);
2505                 break;
2506         case NVME_CR_ACQ_LOW:
2507                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2508                                (0xFFFFF000 & value);
2509                 break;
2510         case NVME_CR_ACQ_HI:
2511                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2512                                (value << 32);
2513                 break;
2514         default:
2515                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2516                          __func__, offset, value, size);
2517         }
2518         pthread_mutex_unlock(&sc->mtx);
2519 }
2520
2521 static void
2522 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2523                 int baridx, uint64_t offset, int size, uint64_t value)
2524 {
2525         struct pci_nvme_softc* sc = pi->pi_arg;
2526
2527         if (baridx == pci_msix_table_bar(pi) ||
2528             baridx == pci_msix_pba_bar(pi)) {
2529                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2530                          " value 0x%lx", baridx, offset, size, value);
2531
2532                 pci_emul_msix_twrite(pi, offset, size, value);
2533                 return;
2534         }
2535
2536         switch (baridx) {
2537         case 0:
2538                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2539                 break;
2540
2541         default:
2542                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2543                          __func__, baridx, value);
2544         }
2545 }
2546
2547 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2548         uint64_t offset, int size)
2549 {
2550         uint64_t value;
2551
2552         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2553
2554         if (offset < NVME_DOORBELL_OFFSET) {
2555                 void *p = &(sc->regs);
2556                 pthread_mutex_lock(&sc->mtx);
2557                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2558                 pthread_mutex_unlock(&sc->mtx);
2559         } else {
2560                 value = 0;
2561                 WPRINTF("pci_nvme: read invalid offset %lu", offset);
2562         }
2563
2564         switch (size) {
2565         case 1:
2566                 value &= 0xFF;
2567                 break;
2568         case 2:
2569                 value &= 0xFFFF;
2570                 break;
2571         case 4:
2572                 value &= 0xFFFFFFFF;
2573                 break;
2574         }
2575
2576         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2577                  offset, size, (uint32_t)value);
2578
2579         return (value);
2580 }
2581
2582
2583
2584 static uint64_t
2585 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2586     uint64_t offset, int size)
2587 {
2588         struct pci_nvme_softc* sc = pi->pi_arg;
2589
2590         if (baridx == pci_msix_table_bar(pi) ||
2591             baridx == pci_msix_pba_bar(pi)) {
2592                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2593                         baridx, offset, size);
2594
2595                 return pci_emul_msix_tread(pi, offset, size);
2596         }
2597
2598         switch (baridx) {
2599         case 0:
2600                 return pci_nvme_read_bar_0(sc, offset, size);
2601
2602         default:
2603                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2604         }
2605
2606         return (0);
2607 }
2608
2609
2610 static int
2611 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2612 {
2613         char bident[sizeof("XX:X:X")];
2614         char    *uopt, *xopts, *config;
2615         uint32_t sectsz;
2616         int optidx;
2617
2618         sc->max_queues = NVME_QUEUES;
2619         sc->max_qentries = NVME_MAX_QENTRIES;
2620         sc->ioslots = NVME_IOSLOTS;
2621         sc->num_squeues = sc->max_queues;
2622         sc->num_cqueues = sc->max_queues;
2623         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2624         sectsz = 0;
2625
2626         uopt = strdup(opts);
2627         optidx = 0;
2628         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2629                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2630         for (xopts = strtok(uopt, ",");
2631              xopts != NULL;
2632              xopts = strtok(NULL, ",")) {
2633
2634                 if ((config = strchr(xopts, '=')) != NULL)
2635                         *config++ = '\0';
2636
2637                 if (!strcmp("maxq", xopts)) {
2638                         sc->max_queues = atoi(config);
2639                 } else if (!strcmp("qsz", xopts)) {
2640                         sc->max_qentries = atoi(config);
2641                 } else if (!strcmp("ioslots", xopts)) {
2642                         sc->ioslots = atoi(config);
2643                 } else if (!strcmp("sectsz", xopts)) {
2644                         sectsz = atoi(config);
2645                 } else if (!strcmp("ser", xopts)) {
2646                         /*
2647                          * This field indicates the Product Serial Number in
2648                          * 7-bit ASCII, unused bytes should be space characters.
2649                          * Ref: NVMe v1.3c.
2650                          */
2651                         cpywithpad((char *)sc->ctrldata.sn,
2652                                    sizeof(sc->ctrldata.sn), config, ' ');
2653                 } else if (!strcmp("ram", xopts)) {
2654                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
2655
2656                         sc->nvstore.type = NVME_STOR_RAM;
2657                         sc->nvstore.size = sz * 1024 * 1024;
2658                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2659                         sc->nvstore.sectsz = 4096;
2660                         sc->nvstore.sectsz_bits = 12;
2661                         if (sc->nvstore.ctx == NULL) {
2662                                 perror("Unable to allocate RAM");
2663                                 free(uopt);
2664                                 return (-1);
2665                         }
2666                 } else if (!strcmp("eui64", xopts)) {
2667                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2668                 } else if (!strcmp("dsm", xopts)) {
2669                         if (!strcmp("auto", config))
2670                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2671                         else if (!strcmp("enable", config))
2672                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2673                         else if (!strcmp("disable", config))
2674                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2675                 } else if (optidx == 0) {
2676                         snprintf(bident, sizeof(bident), "%d:%d",
2677                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2678                         sc->nvstore.ctx = blockif_open(xopts, bident);
2679                         if (sc->nvstore.ctx == NULL) {
2680                                 perror("Could not open backing file");
2681                                 free(uopt);
2682                                 return (-1);
2683                         }
2684                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2685                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2686                 } else {
2687                         EPRINTLN("Invalid option %s", xopts);
2688                         free(uopt);
2689                         return (-1);
2690                 }
2691
2692                 optidx++;
2693         }
2694         free(uopt);
2695
2696         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2697                 EPRINTLN("backing store not specified");
2698                 return (-1);
2699         }
2700         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2701                 sc->nvstore.sectsz = sectsz;
2702         else if (sc->nvstore.type != NVME_STOR_RAM)
2703                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2704         for (sc->nvstore.sectsz_bits = 9;
2705              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2706              sc->nvstore.sectsz_bits++);
2707
2708         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2709                 sc->max_queues = NVME_QUEUES;
2710
2711         if (sc->max_qentries <= 0) {
2712                 EPRINTLN("Invalid qsz option");
2713                 return (-1);
2714         }
2715         if (sc->ioslots <= 0) {
2716                 EPRINTLN("Invalid ioslots option");
2717                 return (-1);
2718         }
2719
2720         return (0);
2721 }
2722
2723 static int
2724 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2725 {
2726         struct pci_nvme_softc *sc;
2727         uint32_t pci_membar_sz;
2728         int     error;
2729
2730         error = 0;
2731
2732         sc = calloc(1, sizeof(struct pci_nvme_softc));
2733         pi->pi_arg = sc;
2734         sc->nsc_pi = pi;
2735
2736         error = pci_nvme_parse_opts(sc, opts);
2737         if (error < 0)
2738                 goto done;
2739         else
2740                 error = 0;
2741
2742         STAILQ_INIT(&sc->ioreqs_free);
2743         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2744         for (int i = 0; i < sc->ioslots; i++) {
2745                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2746         }
2747
2748         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2749         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2750         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2751         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2752         pci_set_cfgdata8(pi, PCIR_PROGIF,
2753                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2754
2755         /*
2756          * Allocate size of NVMe registers + doorbell space for all queues.
2757          *
2758          * The specification requires a minimum memory I/O window size of 16K.
2759          * The Windows driver will refuse to start a device with a smaller
2760          * window.
2761          */
2762         pci_membar_sz = sizeof(struct nvme_registers) +
2763             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2764         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2765
2766         DPRINTF("nvme membar size: %u", pci_membar_sz);
2767
2768         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2769         if (error) {
2770                 WPRINTF("%s pci alloc mem bar failed", __func__);
2771                 goto done;
2772         }
2773
2774         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2775         if (error) {
2776                 WPRINTF("%s pci add msixcap failed", __func__);
2777                 goto done;
2778         }
2779
2780         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2781         if (error) {
2782                 WPRINTF("%s pci add Express capability failed", __func__);
2783                 goto done;
2784         }
2785
2786         pthread_mutex_init(&sc->mtx, NULL);
2787         sem_init(&sc->iosemlock, 0, sc->ioslots);
2788
2789         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2790         /*
2791          * Controller data depends on Namespace data so initialize Namespace
2792          * data first.
2793          */
2794         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2795         pci_nvme_init_ctrldata(sc);
2796         pci_nvme_init_logpages(sc);
2797         pci_nvme_init_features(sc);
2798
2799         pci_nvme_aer_init(sc);
2800
2801         pci_nvme_reset(sc);
2802
2803         pci_lintr_request(pi);
2804
2805 done:
2806         return (error);
2807 }
2808
2809
2810 struct pci_devemu pci_de_nvme = {
2811         .pe_emu =       "nvme",
2812         .pe_init =      pci_nvme_init,
2813         .pe_barwrite =  pci_nvme_write,
2814         .pe_barread =   pci_nvme_read
2815 };
2816 PCI_EMUL_SET(pci_de_nvme);