1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
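/*
 * Example usage (illustrative only; the slot number, zvol path, and serial
 * number below are placeholders, not defaults):
 *
 *  -s 4,nvme,/dev/zvol/tank/vm0-disk0,ser=NVME0001
 *  -s 4,nvme,ram=1024,maxq=8,qsz=512,ioslots=16
 */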
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN             0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
106
107 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
108 #define NVME_MDTS               9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
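/*
 * Worked example from the values above: MDTS = 9 and MPSMIN = 0 (4 KiB
 * pages) give a maximum transfer of 2^9 * 4 KiB = 2 MiB. Such a transfer
 * can span up to 513 pages when the first PRP entry is not page aligned,
 * hence NVME_MAX_IOVEC = 513.
 */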
112
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS          0xffff
115 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
116
117 /* helpers */
118
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)         ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)         ((one)  - 1)
123
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127          ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
128
129 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
130
131 enum nvme_controller_register_offsets {
132         NVME_CR_CAP_LOW = 0x00,
133         NVME_CR_CAP_HI  = 0x04,
134         NVME_CR_VS      = 0x08,
135         NVME_CR_INTMS   = 0x0c,
136         NVME_CR_INTMC   = 0x10,
137         NVME_CR_CC      = 0x14,
138         NVME_CR_CSTS    = 0x1c,
139         NVME_CR_NSSR    = 0x20,
140         NVME_CR_AQA     = 0x24,
141         NVME_CR_ASQ_LOW = 0x28,
142         NVME_CR_ASQ_HI  = 0x2c,
143         NVME_CR_ACQ_LOW = 0x30,
144         NVME_CR_ACQ_HI  = 0x34,
145 };
146
147 enum nvme_cmd_cdw11 {
148         NVME_CMD_CDW11_PC  = 0x0001,
149         NVME_CMD_CDW11_IEN = 0x0002,
150         NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152
153 enum nvme_copy_dir {
154         NVME_COPY_TO_PRP,
155         NVME_COPY_FROM_PRP,
156 };
157
158 #define NVME_CQ_INTEN   0x01
159 #define NVME_CQ_INTCOAL 0x02
160
161 struct nvme_completion_queue {
162         struct nvme_completion *qbase;
163         pthread_mutex_t mtx;
164         uint32_t        size;
165         uint16_t        tail; /* nvme progress */
166         uint16_t        head; /* guest progress */
167         uint16_t        intr_vec;
168         uint32_t        intr_en;
169 };
170
171 struct nvme_submission_queue {
172         struct nvme_command *qbase;
173         pthread_mutex_t mtx;
174         uint32_t        size;
175         uint16_t        head; /* nvme progress */
176         uint16_t        tail; /* guest progress */
177         uint16_t        cqid; /* completion queue id */
178         int             qpriority;
179 };
180
181 enum nvme_storage_type {
182         NVME_STOR_BLOCKIF = 0,
183         NVME_STOR_RAM = 1,
184 };
185
186 struct pci_nvme_blockstore {
187         enum nvme_storage_type type;
188         void            *ctx;
189         uint64_t        size;
190         uint32_t        sectsz;
191         uint32_t        sectsz_bits;
192         uint64_t        eui64;
193         uint32_t        deallocate:1;
194 };
195
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
203 #define MDTS_PAD_SIZE \
204         NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205         NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206         0
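/*
 * Sketch of the arithmetic, assuming BLOCKIF_IOV_MAX is 128 (its value in
 * block_if.h at the time of writing): NVME_MAX_IOVEC is 513, so each
 * pci_nvme_ioreq carries 513 - 128 = 385 spare iovec entries beyond those
 * already present in struct blockif_req.
 */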
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 struct pci_nvme_aer {
255         STAILQ_ENTRY(pci_nvme_aer) link;
256         uint16_t        cid;    /* Command ID of the submitted AER */
257 };
258
259 struct pci_nvme_softc {
260         struct pci_devinst *nsc_pi;
261
262         pthread_mutex_t mtx;
263
264         struct nvme_registers regs;
265
266         struct nvme_namespace_data  nsdata;
267         struct nvme_controller_data ctrldata;
268         struct nvme_error_information_entry err_log;
269         struct nvme_health_information_page health_log;
270         struct nvme_firmware_page fw_log;
271
272         struct pci_nvme_blockstore nvstore;
273
274         uint16_t        max_qentries;   /* max entries per queue */
275         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
276         uint32_t        num_cqueues;
277         uint32_t        num_squeues;
278         bool            num_q_is_set; /* Has host set Number of Queues */
279
280         struct pci_nvme_ioreq *ioreqs;
281         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
282         uint32_t        pending_ios;
283         uint32_t        ioslots;
284         sem_t           iosemlock;
285
286         /*
287          * Memory mapped Submission and Completion queues
288          * Each array includes both Admin and IO queues
289          */
290         struct nvme_completion_queue *compl_queues;
291         struct nvme_submission_queue *submit_queues;
292
293         struct nvme_feature_obj feat[NVME_FID_MAX];
294
295         enum nvme_dsm_type dataset_management;
296
297         /* Accounting for SMART data */
298         __uint128_t     read_data_units;
299         __uint128_t     write_data_units;
300         __uint128_t     read_commands;
301         __uint128_t     write_commands;
302         uint32_t        read_dunits_remainder;
303         uint32_t        write_dunits_remainder;
304
305         STAILQ_HEAD(, pci_nvme_aer) aer_list;
306         uint32_t        aer_count;
307 };
308
309
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
313
314 /* Controller Configuration utils */
315 #define NVME_CC_GET_EN(cc) \
316         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
317 #define NVME_CC_GET_CSS(cc) \
318         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
319 #define NVME_CC_GET_SHN(cc) \
320         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
321 #define NVME_CC_GET_IOSQES(cc) \
322         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
323 #define NVME_CC_GET_IOCQES(cc) \
324         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
325
326 #define NVME_CC_WRITE_MASK \
327         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
328          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
329          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
330
331 #define NVME_CC_NEN_WRITE_MASK \
332         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
333          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
334          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
335
336 /* Controller Status utils */
337 #define NVME_CSTS_GET_RDY(sts) \
338         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
339
340 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
341
342 /* Completion Queue status word utils */
343 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
344 #define NVME_STATUS_MASK \
345         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
346          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
347
348 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
349         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
350
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352     struct nvme_feature_obj *,
353     struct nvme_command *,
354     struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359 static void nvme_feature_iv_config(struct pci_nvme_softc *,
360     struct nvme_feature_obj *,
361     struct nvme_command *,
362     struct nvme_completion *);
363
364 static __inline void
365 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
366 {
367         size_t len;
368
369         len = strnlen(src, dst_size);
370         memset(dst, pad, dst_size);
371         memcpy(dst, src, len);
372 }
373
374 static __inline void
375 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
376 {
377
378         *status &= ~NVME_STATUS_MASK;
379         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
380                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
381 }
382
383 static __inline void
384 pci_nvme_status_genc(uint16_t *status, uint16_t code)
385 {
386
387         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
388 }
389
390 /*
391  * Initialize the requested number of IO Submission and Completion Queues.
392  * Admin queues are allocated implicitly.
393  */
394 static void
395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
396 {
397         uint32_t i;
398
399         /*
400          * Allocate and initialize the Submission Queues
401          */
402         if (nsq > NVME_QUEUES) {
403                 WPRINTF("%s: clamping number of SQ from %u to %u",
404                                         __func__, nsq, NVME_QUEUES);
405                 nsq = NVME_QUEUES;
406         }
407
408         sc->num_squeues = nsq;
409
410         sc->submit_queues = calloc(sc->num_squeues + 1,
411                                 sizeof(struct nvme_submission_queue));
412         if (sc->submit_queues == NULL) {
413                 WPRINTF("%s: SQ allocation failed", __func__);
414                 sc->num_squeues = 0;
415         } else {
416                 struct nvme_submission_queue *sq = sc->submit_queues;
417
418                 for (i = 0; i < sc->num_squeues; i++)
419                         pthread_mutex_init(&sq[i].mtx, NULL);
420         }
421
422         /*
423          * Allocate and initialize the Completion Queues
424          */
425         if (ncq > NVME_QUEUES) {
426                 WPRINTF("%s: clamping number of CQ from %u to %u",
427                                         __func__, ncq, NVME_QUEUES);
428                 ncq = NVME_QUEUES;
429         }
430
431         sc->num_cqueues = ncq;
432
433         sc->compl_queues = calloc(sc->num_cqueues + 1,
434                                 sizeof(struct nvme_completion_queue));
435         if (sc->compl_queues == NULL) {
436                 WPRINTF("%s: CQ allocation failed", __func__);
437                 sc->num_cqueues = 0;
438         } else {
439                 struct nvme_completion_queue *cq = sc->compl_queues;
440
441                 for (i = 0; i < sc->num_cqueues; i++)
442                         pthread_mutex_init(&cq[i].mtx, NULL);
443         }
444 }
445
446 static void
447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
448 {
449         struct nvme_controller_data *cd = &sc->ctrldata;
450
451         cd->vid = 0xFB5D;
452         cd->ssvid = 0x0000;
453
454         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
455         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
456
457         /* Num of submission commands that we can handle at a time (2^rab) */
458         cd->rab   = 4;
459
460         /* FreeBSD OUI */
461         cd->ieee[0] = 0x58;
462         cd->ieee[1] = 0x9c;
463         cd->ieee[2] = 0xfc;
464
465         cd->mic = 0;
466
467         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
468
469         cd->ver = 0x00010300;
470
471         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
472         cd->acl = 2;
473         cd->aerl = 4;
474
475         /* Advertise 1, Read-only firmware slot */
476         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
477             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
478         cd->lpa = 0;    /* TODO: support some simple things like SMART */
479         cd->elpe = 0;   /* max error log page entries */
480         cd->npss = 1;   /* number of power states supported */
481
482         /* Warning Composite Temperature Threshold */
483         cd->wctemp = 0x0157;
484
485         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
486             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
487         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
488             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
489         cd->nn = 1;     /* number of namespaces */
490
491         cd->oncs = 0;
492         switch (sc->dataset_management) {
493         case NVME_DATASET_MANAGEMENT_AUTO:
494                 if (sc->nvstore.deallocate)
495                         cd->oncs |= NVME_ONCS_DSM;
496                 break;
497         case NVME_DATASET_MANAGEMENT_ENABLE:
498                 cd->oncs |= NVME_ONCS_DSM;
499                 break;
500         default:
501                 break;
502         }
503
504         cd->fna = 0x03;
505
506         cd->power_state[0].mp = 10;
507 }
508
509 /*
510  * Calculate the CRC-16 of the given buffer
511  * See copyright attribution at top of file
512  */
513 static uint16_t
514 crc16(uint16_t crc, const void *buffer, unsigned int len)
515 {
516         const unsigned char *cp = buffer;
517         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
518         static uint16_t const crc16_table[256] = {
519                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
520                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
521                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
522                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
523                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
524                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
525                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
526                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
527                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
528                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
529                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
530                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
531                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
532                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
533                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
534                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
535                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
536                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
537                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
538                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
539                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
540                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
541                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
542                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
543                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
544                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
545                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
546                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
547                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
548                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
549                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
550                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
551         };
552
553         while (len--)
554                 crc = (((crc >> 8) & 0xffU) ^
555                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
556         return crc;
557 }
558
559 static void
560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
561     struct nvme_namespace_data *nd, uint32_t nsid,
562     struct pci_nvme_blockstore *nvstore)
563 {
564
565         /* Get capacity and block size information from backing store */
566         nd->nsze = nvstore->size / nvstore->sectsz;
567         nd->ncap = nd->nsze;
568         nd->nuse = nd->nsze;
569
570         if (nvstore->type == NVME_STOR_BLOCKIF)
571                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
572
573         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
574         nd->flbas = 0;
575
576         /* Create an EUI-64 if user did not provide one */
577         if (nvstore->eui64 == 0) {
578                 char *data = NULL;
579                 uint64_t eui64 = nvstore->eui64;
580
581                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
582                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
583
584                 if (data != NULL) {
585                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
586                         free(data);
587                 }
588                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
589         }
590         be64enc(nd->eui64, nvstore->eui64);
591
592         /* LBA data-sz = 2^lbads */
593         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
594 }
595
596 static void
597 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
598 {
599
600         memset(&sc->err_log, 0, sizeof(sc->err_log));
601         memset(&sc->health_log, 0, sizeof(sc->health_log));
602         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
603
604         /* Set read/write remainder to round up according to spec */
605         sc->read_dunits_remainder = 999;
606         sc->write_dunits_remainder = 999;
607 }
608
609 static void
610 pci_nvme_init_features(struct pci_nvme_softc *sc)
611 {
612
613         sc->feat[0].set = nvme_feature_invalid_cb;
614         sc->feat[0].get = nvme_feature_invalid_cb;
615
616         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
617         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
618         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
619         sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
620             nvme_feature_iv_config;
621         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
622             nvme_feature_invalid_cb;
623         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
624             nvme_feature_invalid_cb;
625 }
626
627 static void
628 pci_nvme_aer_init(struct pci_nvme_softc *sc)
629 {
630
631         STAILQ_INIT(&sc->aer_list);
632         sc->aer_count = 0;
633 }
634
635 static void
636 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
637 {
638         struct pci_nvme_aer *aer = NULL;
639
640         while (!STAILQ_EMPTY(&sc->aer_list)) {
641                 aer = STAILQ_FIRST(&sc->aer_list);
642                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
643                 free(aer);
644         }
645
646         pci_nvme_aer_init(sc);
647 }
648
649 static bool
650 pci_nvme_aer_available(struct pci_nvme_softc *sc)
651 {
652
653         return (!STAILQ_EMPTY(&sc->aer_list));
654 }
655
656 static bool
657 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
658 {
659         struct nvme_controller_data *cd = &sc->ctrldata;
660
661         /* AERL is a zero-based value while aer_count is one-based */
662         return (sc->aer_count == (cd->aerl + 1));
663 }
664
665 /*
666  * Add an Async Event Request
667  *
668  * Stores an AER to be returned later if the Controller needs to notify the
669  * host of an event.
670  * Note that while the NVMe spec doesn't require Controllers to return AER's
671  * in order, this implementation does preserve the order.
672  */
673 static int
674 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
675 {
676         struct pci_nvme_aer *aer = NULL;
677
678         if (pci_nvme_aer_limit_reached(sc))
679                 return (-1);
680
681         aer = calloc(1, sizeof(struct pci_nvme_aer));
682         if (aer == NULL)
683                 return (-1);
684
685         sc->aer_count++;
686
687         /* Save the Command ID for use in the completion message */
688         aer->cid = cid;
689         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
690
691         return (0);
692 }
693
694 /*
695  * Get an Async Event Request structure
696  *
697  * Returns a pointer to an AER previously submitted by the host or NULL if
698  * no AER's exist. Caller is responsible for freeing the returned struct.
699  */
700 static struct pci_nvme_aer *
701 pci_nvme_aer_get(struct pci_nvme_softc *sc)
702 {
703         struct pci_nvme_aer *aer = NULL;
704
705         aer = STAILQ_FIRST(&sc->aer_list);
706         if (aer != NULL) {
707                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
708                 sc->aer_count--;
709         }
710         
711         return (aer);
712 }
713
714 static void
715 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
716 {
717         uint32_t i;
718
719         DPRINTF("%s", __func__);
720
721         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
722             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
723             (60 << NVME_CAP_LO_REG_TO_SHIFT);
724
725         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
726
727         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
728
729         sc->regs.cc = 0;
730         sc->regs.csts = 0;
731
732         assert(sc->submit_queues != NULL);
733
734         for (i = 0; i < sc->num_squeues + 1; i++) {
735                 sc->submit_queues[i].qbase = NULL;
736                 sc->submit_queues[i].size = 0;
737                 sc->submit_queues[i].cqid = 0;
738                 sc->submit_queues[i].tail = 0;
739                 sc->submit_queues[i].head = 0;
740         }
741
742         assert(sc->compl_queues != NULL);
743
744         for (i = 0; i < sc->num_cqueues + 1; i++) {
745                 sc->compl_queues[i].qbase = NULL;
746                 sc->compl_queues[i].size = 0;
747                 sc->compl_queues[i].tail = 0;
748                 sc->compl_queues[i].head = 0;
749         }
750
751         sc->num_q_is_set = false;
752
753         pci_nvme_aer_destroy(sc);
754 }
755
756 static void
757 pci_nvme_reset(struct pci_nvme_softc *sc)
758 {
759         pthread_mutex_lock(&sc->mtx);
760         pci_nvme_reset_locked(sc);
761         pthread_mutex_unlock(&sc->mtx);
762 }
763
764 static void
765 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
766 {
767         uint16_t acqs, asqs;
768
769         DPRINTF("%s", __func__);
770
771         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
772         sc->submit_queues[0].size = asqs;
773         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
774                     sizeof(struct nvme_command) * asqs);
775
776         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
777                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
778
779         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
780             NVME_AQA_REG_ACQS_MASK) + 1;
781         sc->compl_queues[0].size = acqs;
782         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
783                  sizeof(struct nvme_completion) * acqs);
784         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
785
786         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
787                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
788 }
789
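/*
 * Copy between a host buffer and a guest PRP pair (summary of the code
 * below): lengths above 8 KiB are rejected; the copy covers the tail of the
 * page addressed by prp1 plus up to one page addressed by prp2, in the
 * direction given by 'dir'. Returns 0 on success, or -1 if the length is
 * too large or a guest address cannot be mapped.
 */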
790 static int
791 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
792         size_t len, enum nvme_copy_dir dir)
793 {
794         uint8_t *p;
795         size_t bytes;
796
797         if (len > (8 * 1024)) {
798                 return (-1);
799         }
800
801         /* Copy from the start of prp1 to the end of the physical page */
802         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
803         bytes = MIN(bytes, len);
804
805         p = vm_map_gpa(ctx, prp1, bytes);
806         if (p == NULL) {
807                 return (-1);
808         }
809
810         if (dir == NVME_COPY_TO_PRP)
811                 memcpy(p, b, bytes);
812         else
813                 memcpy(b, p, bytes);
814
815         b += bytes;
816
817         len -= bytes;
818         if (len == 0) {
819                 return (0);
820         }
821
822         len = MIN(len, PAGE_SIZE);
823
824         p = vm_map_gpa(ctx, prp2, len);
825         if (p == NULL) {
826                 return (-1);
827         }
828
829         if (dir == NVME_COPY_TO_PRP)
830                 memcpy(p, b, len);
831         else
832                 memcpy(b, p, len);
833
834         return (0);
835 }
836
837 /*
838  * Write a Completion Queue Entry update
839  *
840  * Write the completion and update the doorbell value
841  */
842 static void
843 pci_nvme_cq_update(struct pci_nvme_softc *sc,
844                 struct nvme_completion_queue *cq,
845                 uint32_t cdw0,
846                 uint16_t cid,
847                 uint16_t sqid,
848                 uint16_t status)
849 {
850         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
851         struct nvme_completion *cqe;
852
853         assert(cq->qbase != NULL);
854
855         pthread_mutex_lock(&cq->mtx);
856
857         cqe = &cq->qbase[cq->tail];
858
859         /* Flip the phase bit */
860         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
861
862         cqe->cdw0 = cdw0;
863         cqe->sqhd = sq->head;
864         cqe->sqid = sqid;
865         cqe->cid = cid;
866         cqe->status = status;
867
868         cq->tail++;
869         if (cq->tail >= cq->size) {
870                 cq->tail = 0;
871         }
872
873         pthread_mutex_unlock(&cq->mtx);
874 }
875
876 static int
877 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
878         struct nvme_completion* compl)
879 {
880         uint16_t qid = command->cdw10 & 0xffff;
881
882         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
883         if (qid == 0 || qid > sc->num_squeues ||
884             (sc->submit_queues[qid].qbase == NULL)) {
885                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
886                         __func__, qid, sc->num_squeues);
887                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
888                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
889                 return (1);
890         }
891
892         sc->submit_queues[qid].qbase = NULL;
893         sc->submit_queues[qid].cqid = 0;
894         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
895         return (1);
896 }
897
898 static int
899 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
900         struct nvme_completion* compl)
901 {
902         if (command->cdw11 & NVME_CMD_CDW11_PC) {
903                 uint16_t qid = command->cdw10 & 0xffff;
904                 struct nvme_submission_queue *nsq;
905
906                 if ((qid == 0) || (qid > sc->num_squeues) ||
907                     (sc->submit_queues[qid].qbase != NULL)) {
908                         WPRINTF("%s queue index %u > num_squeues %u",
909                                 __func__, qid, sc->num_squeues);
910                         pci_nvme_status_tc(&compl->status,
911                             NVME_SCT_COMMAND_SPECIFIC,
912                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
913                         return (1);
914                 }
915
916                 nsq = &sc->submit_queues[qid];
917                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
918                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
919                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
920                         /*
921                          * Queues must specify at least two entries
922                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
923                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
924                          */
925                         pci_nvme_status_tc(&compl->status,
926                             NVME_SCT_COMMAND_SPECIFIC,
927                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
928                         return (1);
929                 }
930
931                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
932                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
933                         pci_nvme_status_tc(&compl->status,
934                             NVME_SCT_COMMAND_SPECIFIC,
935                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
936                         return (1);
937                 }
938
939                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
940                         pci_nvme_status_tc(&compl->status,
941                             NVME_SCT_COMMAND_SPECIFIC,
942                             NVME_SC_COMPLETION_QUEUE_INVALID);
943                         return (1);
944                 }
945
946                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
947
948                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
949                               sizeof(struct nvme_command) * (size_t)nsq->size);
950
951                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
952                         qid, nsq->size, nsq->qbase, nsq->cqid);
953
954                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
955
956                 DPRINTF("%s completed creating IOSQ qid %u",
957                          __func__, qid);
958         } else {
959                 /*
960                  * Guest sent a non-contiguous submission queue request.
961                  * This setting is unsupported by this emulation.
962                  */
963                 WPRINTF("%s unsupported non-contig (list-based) "
964                          "create i/o submission queue", __func__);
965
966                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
967         }
968         return (1);
969 }
970
971 static int
972 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
973         struct nvme_completion* compl)
974 {
975         uint16_t qid = command->cdw10 & 0xffff;
976         uint16_t sqid;
977
978         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
979         if (qid == 0 || qid > sc->num_cqueues ||
980             (sc->compl_queues[qid].qbase == NULL)) {
981                 WPRINTF("%s queue index %u / num_cqueues %u",
982                         __func__, qid, sc->num_cqueues);
983                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
984                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
985                 return (1);
986         }
987
988         /* Deleting an Active CQ is an error */
989         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
990                 if (sc->submit_queues[sqid].cqid == qid) {
991                         pci_nvme_status_tc(&compl->status,
992                             NVME_SCT_COMMAND_SPECIFIC,
993                             NVME_SC_INVALID_QUEUE_DELETION);
994                         return (1);
995                 }
996
997         sc->compl_queues[qid].qbase = NULL;
998         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
999         return (1);
1000 }
1001
1002 static int
1003 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1004         struct nvme_completion* compl)
1005 {
1006         struct nvme_completion_queue *ncq;
1007         uint16_t qid = command->cdw10 & 0xffff;
1008
1009         /* Only support Physically Contiguous queues */
1010         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1011                 WPRINTF("%s unsupported non-contig (list-based) "
1012                          "create i/o completion queue",
1013                          __func__);
1014
1015                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1016                 return (1);
1017         }
1018
1019         if ((qid == 0) || (qid > sc->num_cqueues) ||
1020             (sc->compl_queues[qid].qbase != NULL)) {
1021                 WPRINTF("%s queue index %u > num_cqueues %u",
1022                         __func__, qid, sc->num_cqueues);
1023                 pci_nvme_status_tc(&compl->status,
1024                     NVME_SCT_COMMAND_SPECIFIC,
1025                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1026                 return (1);
1027         }
1028
1029         ncq = &sc->compl_queues[qid];
1030         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1031         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1032         if (ncq->intr_vec > (sc->max_queues + 1)) {
1033                 pci_nvme_status_tc(&compl->status,
1034                     NVME_SCT_COMMAND_SPECIFIC,
1035                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1036                 return (1);
1037         }
1038
1039         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1040         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1041                 /*
1042                  * Queues must specify at least two entries
1043                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1044                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1045                  */
1046                 pci_nvme_status_tc(&compl->status,
1047                     NVME_SCT_COMMAND_SPECIFIC,
1048                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1049                 return (1);
1050         }
1051         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1052                      command->prp1,
1053                      sizeof(struct nvme_completion) * (size_t)ncq->size);
1054
1055         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1056
1057
1058         return (1);
1059 }
1060
1061 static int
1062 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1063         struct nvme_completion* compl)
1064 {
1065         uint32_t logsize;
1066         uint8_t logpage = command->cdw10 & 0xFF;
1067
1068         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1069
1070         /*
1071          * Command specifies the number of dwords to return in fields NUMDU
1072          * and NUMDL. This is a zero-based value.
1073          */
1074         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1075         logsize *= sizeof(uint32_t);
1076
1077         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1078
1079         switch (logpage) {
1080         case NVME_LOG_ERROR:
1081                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1082                     command->prp2, (uint8_t *)&sc->err_log,
1083                     MIN(logsize, sizeof(sc->err_log)),
1084                     NVME_COPY_TO_PRP);
1085                 break;
1086         case NVME_LOG_HEALTH_INFORMATION:
1087                 pthread_mutex_lock(&sc->mtx);
1088                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1089                     sizeof(sc->health_log.data_units_read));
1090                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1091                     sizeof(sc->health_log.data_units_written));
1092                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1093                     sizeof(sc->health_log.host_read_commands));
1094                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1095                     sizeof(sc->health_log.host_write_commands));
1096                 pthread_mutex_unlock(&sc->mtx);
1097
1098                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1099                     command->prp2, (uint8_t *)&sc->health_log,
1100                     MIN(logsize, sizeof(sc->health_log)),
1101                     NVME_COPY_TO_PRP);
1102                 break;
1103         case NVME_LOG_FIRMWARE_SLOT:
1104                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1105                     command->prp2, (uint8_t *)&sc->fw_log,
1106                     MIN(logsize, sizeof(sc->fw_log)),
1107                     NVME_COPY_TO_PRP);
1108                 break;
1109         default:
1110                 DPRINTF("%s get log page %x command not supported",
1111                         __func__, logpage);
1112
1113                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1114                     NVME_SC_INVALID_LOG_PAGE);
1115         }
1116
1117         return (1);
1118 }
1119
1120 static int
1121 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1122         struct nvme_completion* compl)
1123 {
1124         void *dest;
1125         uint16_t status;
1126
1127         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1128                 command->cdw10 & 0xFF, command->nsid);
1129
1130         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1131
1132         switch (command->cdw10 & 0xFF) {
1133         case 0x00: /* return Identify Namespace data structure */
1134                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1135                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1136                     NVME_COPY_TO_PRP);
1137                 break;
1138         case 0x01: /* return Identify Controller data structure */
1139                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1140                     command->prp2, (uint8_t *)&sc->ctrldata,
1141                     sizeof(sc->ctrldata),
1142                     NVME_COPY_TO_PRP);
1143                 break;
1144         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1145                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1146                                   sizeof(uint32_t) * 1024);
1147                 /* All unused entries shall be zero */
1148                 bzero(dest, sizeof(uint32_t) * 1024);
1149                 ((uint32_t *)dest)[0] = 1;
1150                 break;
1151         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1152                 if (command->nsid != 1) {
1153                         pci_nvme_status_genc(&status,
1154                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1155                         break;
1156                 }
1157                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1158                                   sizeof(uint32_t) * 1024);
1159                 /* All bytes after the descriptor shall be zero */
1160                 bzero(dest, sizeof(uint32_t) * 1024);
1161
1162                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1163                 ((uint8_t *)dest)[0] = 1;
1164                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1165                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1166                 break;
1167         default:
1168                 DPRINTF("%s unsupported identify command requested 0x%x",
1169                          __func__, command->cdw10 & 0xFF);
1170                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1171                 break;
1172         }
1173
1174         compl->status = status;
1175         return (1);
1176 }
1177
1178 static const char *
1179 nvme_fid_to_name(uint8_t fid)
1180 {
1181         const char *name;
1182
1183         switch (fid) {
1184         case NVME_FEAT_ARBITRATION:
1185                 name = "Arbitration";
1186                 break;
1187         case NVME_FEAT_POWER_MANAGEMENT:
1188                 name = "Power Management";
1189                 break;
1190         case NVME_FEAT_LBA_RANGE_TYPE:
1191                 name = "LBA Range Type";
1192                 break;
1193         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1194                 name = "Temperature Threshold";
1195                 break;
1196         case NVME_FEAT_ERROR_RECOVERY:
1197                 name = "Error Recovery";
1198                 break;
1199         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1200                 name = "Volatile Write Cache";
1201                 break;
1202         case NVME_FEAT_NUMBER_OF_QUEUES:
1203                 name = "Number of Queues";
1204                 break;
1205         case NVME_FEAT_INTERRUPT_COALESCING:
1206                 name = "Interrupt Coalescing";
1207                 break;
1208         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1209                 name = "Interrupt Vector Configuration";
1210                 break;
1211         case NVME_FEAT_WRITE_ATOMICITY:
1212                 name = "Write Atomicity Normal";
1213                 break;
1214         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1215                 name = "Asynchronous Event Configuration";
1216                 break;
1217         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1218                 name = "Autonomous Power State Transition";
1219                 break;
1220         case NVME_FEAT_HOST_MEMORY_BUFFER:
1221                 name = "Host Memory Buffer";
1222                 break;
1223         case NVME_FEAT_TIMESTAMP:
1224                 name = "Timestamp";
1225                 break;
1226         case NVME_FEAT_KEEP_ALIVE_TIMER:
1227                 name = "Keep Alive Timer";
1228                 break;
1229         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1230                 name = "Host Controlled Thermal Management";
1231                 break;
1232         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1233                 name = "Non-Operational Power State Config";
1234                 break;
1235         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1236                 name = "Read Recovery Level Config";
1237                 break;
1238         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1239                 name = "Predictable Latency Mode Config";
1240                 break;
1241         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1242                 name = "Predictable Latency Mode Window";
1243                 break;
1244         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1245                 name = "LBA Status Information Report Interval";
1246                 break;
1247         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1248                 name = "Host Behavior Support";
1249                 break;
1250         case NVME_FEAT_SANITIZE_CONFIG:
1251                 name = "Sanitize Config";
1252                 break;
1253         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1254                 name = "Endurance Group Event Configuration";
1255                 break;
1256         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1257                 name = "Software Progress Marker";
1258                 break;
1259         case NVME_FEAT_HOST_IDENTIFIER:
1260                 name = "Host Identifier";
1261                 break;
1262         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1263                 name = "Reservation Notification Mask";
1264                 break;
1265         case NVME_FEAT_RESERVATION_PERSISTENCE:
1266                 name = "Reservation Persistence";
1267                 break;
1268         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1269                 name = "Namespace Write Protection Config";
1270                 break;
1271         default:
1272                 name = "Unknown";
1273                 break;
1274         }
1275
1276         return (name);
1277 }
1278
1279 static void
1280 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1281     struct nvme_feature_obj *feat,
1282     struct nvme_command *command,
1283     struct nvme_completion *compl)
1284 {
1285
1286         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1287 }
1288
1289 static void
1290 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1291     struct nvme_feature_obj *feat,
1292     struct nvme_command *command,
1293     struct nvme_completion *compl)
1294 {
1295         uint32_t i;
1296         uint32_t cdw11 = command->cdw11;
1297         uint16_t iv;
1298         bool cd;
1299
1300         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1301
1302         iv = cdw11 & 0xffff;
1303         cd = cdw11 & (1 << 16);
1304
1305         if (iv > (sc->max_queues + 1)) {
1306                 return;
1307         }
1308
1309         /* The Admin Q (IV 0) does not support Interrupt Coalescing, i.e. CD must be set */
1310         if ((iv == 0) && !cd)
1311                 return;
1312
1313         /* Requested Interrupt Vector must be used by a CQ */
1314         for (i = 0; i < sc->num_cqueues + 1; i++) {
1315                 if (sc->compl_queues[i].intr_vec == iv) {
1316                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1317                 }
1318         }
1319
1320 }
1321
1322 static void
1323 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1324     struct nvme_feature_obj *feat,
1325     struct nvme_command *command,
1326     struct nvme_completion *compl)
1327 {
1328         uint16_t nqr;   /* Number of Queues Requested */
1329
1330         if (sc->num_q_is_set) {
1331                 WPRINTF("%s: Number of Queues already set", __func__);
1332                 pci_nvme_status_genc(&compl->status,
1333                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1334                 return;
1335         }
1336
1337         nqr = command->cdw11 & 0xFFFF;
1338         if (nqr == 0xffff) {
1339                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1340                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1341                 return;
1342         }
1343
1344         sc->num_squeues = ONE_BASED(nqr);
1345         if (sc->num_squeues > sc->max_queues) {
1346                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1347                                         sc->max_queues);
1348                 sc->num_squeues = sc->max_queues;
1349         }
1350
1351         nqr = (command->cdw11 >> 16) & 0xFFFF;
1352         if (nqr == 0xffff) {
1353                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1354                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1355                 return;
1356         }
1357
1358         sc->num_cqueues = ONE_BASED(nqr);
1359         if (sc->num_cqueues > sc->max_queues) {
1360                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1361                                         sc->max_queues);
1362                 sc->num_cqueues = sc->max_queues;
1363         }
1364
1365         /* Patch the command value which will be saved on callback's return */
1366         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1367         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1368
1369         sc->num_q_is_set = true;
1370 }
1371
1372 static int
1373 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1374         struct nvme_completion *compl)
1375 {
1376         struct nvme_feature_obj *feat;
1377         uint32_t nsid = command->nsid;
1378         uint8_t fid = command->cdw10 & 0xFF;
1379
1380         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1381
1382         if (fid >= NVME_FID_MAX) {
1383                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1384                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1385                 return (1);
1386         }
1387         feat = &sc->feat[fid];
1388
1389         if (!feat->namespace_specific &&
1390             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1391                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1392                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1393                 return (1);
1394         }
1395
1396         compl->cdw0 = 0;
1397         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1398
1399         if (feat->set)
1400                 feat->set(sc, feat, command, compl);
1401
1402         if (compl->status == NVME_SC_SUCCESS)
1403                 feat->cdw11 = command->cdw11;
1404
1405         return (0);
1406 }
1407
1408 static int
1409 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1410         struct nvme_completion* compl)
1411 {
1412         struct nvme_feature_obj *feat;
1413         uint8_t fid = command->cdw10 & 0xFF;
1414
1415         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1416
1417         if (fid >= NVME_FID_MAX) {
1418                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1419                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1420                 return (1);
1421         }
1422
1423         compl->cdw0 = 0;
1424         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1425
1426         feat = &sc->feat[fid];
1427         if (feat->get) {
1428                 feat->get(sc, feat, command, compl);
1429         }
1430
1431         if (compl->status == NVME_SC_SUCCESS) {
1432                 compl->cdw0 = feat->cdw11;
1433         }
1434
1435         return (0);
1436 }
1437
1438 static int
1439 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1440         struct nvme_completion* compl)
1441 {
1442         uint8_t ses, lbaf, pi;
1443
1444         /* Only supports Secure Erase Setting - User Data Erase */
1445         ses = (command->cdw10 >> 9) & 0x7;
1446         if (ses > 0x1) {
1447                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1448                 return (1);
1449         }
1450
1451         /* Only supports a single LBA Format */
1452         lbaf = command->cdw10 & 0xf;
1453         if (lbaf != 0) {
1454                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1455                     NVME_SC_INVALID_FORMAT);
1456                 return (1);
1457         }
1458
1459         /* Doesn't support Protection Information */
1460         pi = (command->cdw10 >> 5) & 0x7;
1461         if (pi != 0) {
1462                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1463                 return (1);
1464         }
1465
1466         if (sc->nvstore.type == NVME_STOR_RAM) {
1467                 if (sc->nvstore.ctx)
1468                         free(sc->nvstore.ctx);
1469                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1470                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1471         } else {
1472                 struct pci_nvme_ioreq *req;
1473                 int err;
1474
1475                 req = pci_nvme_get_ioreq(sc);
1476                 if (req == NULL) {
1477                         pci_nvme_status_genc(&compl->status,
1478                             NVME_SC_INTERNAL_DEVICE_ERROR);
1479                         WPRINTF("%s: unable to allocate IO req", __func__);
1480                         return (1);
1481                 }
1482                 req->nvme_sq = &sc->submit_queues[0];
1483                 req->sqid = 0;
1484                 req->opc = command->opc;
1485                 req->cid = command->cid;
1486                 req->nsid = command->nsid;
1487
1488                 req->io_req.br_offset = 0;
1489                 req->io_req.br_resid = sc->nvstore.size;
1490                 req->io_req.br_callback = pci_nvme_io_done;
1491
1492                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1493                 if (err) {
1494                         pci_nvme_status_genc(&compl->status,
1495                             NVME_SC_INTERNAL_DEVICE_ERROR);
1496                         pci_nvme_release_ioreq(sc, req);
1497                 }
1498         }
1499
1500         return (1);
1501 }
1502
1503 static int
1504 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1505         struct nvme_completion* compl)
1506 {
1507         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1508                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1509
1510         /* TODO: search for the command ID and abort it */
1511
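             /*
              * Per the NVMe spec, setting bit 0 of Dword 0 in the completion
              * indicates that the command was not aborted.
              */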
1512         compl->cdw0 = 1;
1513         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1514         return (1);
1515 }
1516
1517 static int
1518 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1519         struct nvme_command* command, struct nvme_completion* compl)
1520 {
1521         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1522
1523         /* Don't exceed the Async Event Request Limit (AERL). */
1524         if (pci_nvme_aer_limit_reached(sc)) {
1525                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1526                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1527                 return (1);
1528         }
1529
1530         if (pci_nvme_aer_add(sc, command->cid)) {
1531                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1532                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1533                 return (1);
1534         }
1535
1536         /*
1537          * Events are raised asynchronously as they occur, based on the Set
1538          * Features cmd. Leave this completion unposted (NVME_NO_STATUS) until
1539          * an event matching the request actually occurs.
1540          */
1541         compl->status = NVME_NO_STATUS;
1542
1543         return (0);
1544 }
1545
1546 static void
1547 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1548 {
1549         struct nvme_completion compl;
1550         struct nvme_command *cmd;
1551         struct nvme_submission_queue *sq;
1552         struct nvme_completion_queue *cq;
1553         uint16_t sqhead;
1554
1555         DPRINTF("%s index %u", __func__, (uint32_t)value);
1556
1557         sq = &sc->submit_queues[0];
1558         cq = &sc->compl_queues[0];
1559
1560         pthread_mutex_lock(&sq->mtx);
1561
1562         sqhead = sq->head;
1563         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1564         
1565         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1566                 cmd = &(sq->qbase)[sqhead];
1567                 compl.cdw0 = 0;
1568                 compl.status = 0;
1569
1570                 switch (cmd->opc) {
1571                 case NVME_OPC_DELETE_IO_SQ:
1572                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1573                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1574                         break;
1575                 case NVME_OPC_CREATE_IO_SQ:
1576                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1577                         nvme_opc_create_io_sq(sc, cmd, &compl);
1578                         break;
1579                 case NVME_OPC_DELETE_IO_CQ:
1580                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1581                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1582                         break;
1583                 case NVME_OPC_CREATE_IO_CQ:
1584                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1585                         nvme_opc_create_io_cq(sc, cmd, &compl);
1586                         break;
1587                 case NVME_OPC_GET_LOG_PAGE:
1588                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1589                         nvme_opc_get_log_page(sc, cmd, &compl);
1590                         break;
1591                 case NVME_OPC_IDENTIFY:
1592                         DPRINTF("%s command IDENTIFY", __func__);
1593                         nvme_opc_identify(sc, cmd, &compl);
1594                         break;
1595                 case NVME_OPC_ABORT:
1596                         DPRINTF("%s command ABORT", __func__);
1597                         nvme_opc_abort(sc, cmd, &compl);
1598                         break;
1599                 case NVME_OPC_SET_FEATURES:
1600                         DPRINTF("%s command SET_FEATURES", __func__);
1601                         nvme_opc_set_features(sc, cmd, &compl);
1602                         break;
1603                 case NVME_OPC_GET_FEATURES:
1604                         DPRINTF("%s command GET_FEATURES", __func__);
1605                         nvme_opc_get_features(sc, cmd, &compl);
1606                         break;
1607                 case NVME_OPC_FIRMWARE_ACTIVATE:
1608                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1609                         pci_nvme_status_tc(&compl.status,
1610                             NVME_SCT_COMMAND_SPECIFIC,
1611                             NVME_SC_INVALID_FIRMWARE_SLOT);
1612                         break;
1613                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1614                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1615                         nvme_opc_async_event_req(sc, cmd, &compl);
1616                         break;
1617                 case NVME_OPC_FORMAT_NVM:
1618                         DPRINTF("%s command FORMAT_NVM", __func__);
1619                         compl.status = NVME_NO_STATUS;
1620                         if ((sc->ctrldata.oacs &
1621                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0)
1622                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1623                         else
1624                                 nvme_opc_format_nvm(sc, cmd, &compl);
1625                         break;
1626                 default:
1627                         DPRINTF("0x%x command is not implemented",
1628                             cmd->opc);
1629                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1630                 }
1631                 sqhead = (sqhead + 1) % sq->size;
1632
1633                 if (NVME_COMPLETION_VALID(compl)) {
1634                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1635                             compl.cdw0,
1636                             cmd->cid,
1637                             0,          /* SQID */
1638                             compl.status);
1639                 }
1640         }
1641
1642         DPRINTF("setting sqhead %u", sqhead);
1643         sq->head = sqhead;
1644
1645         if (cq->head != cq->tail)
1646                 pci_generate_msix(sc->nsc_pi, 0);
1647
1648         pthread_mutex_unlock(&sq->mtx);
1649 }
1650
1651 /*
1652  * Update the Write and Read statistics reported in SMART data
1653  *
1654  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1655  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1656  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1657  */
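     /*
      * Illustrative example: a 1,024 block (512 KiB) write adds 1,024 to a
      * remainder seeded at 999, i.e. 2,023, which the loop below converts
      * into 2 data units with 23 left over in the remainder.
      */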
1658 static void
1659 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1660     size_t bytes, uint16_t status)
1661 {
1662
1663         pthread_mutex_lock(&sc->mtx);
1664         switch (opc) {
1665         case NVME_OPC_WRITE:
1666                 sc->write_commands++;
1667                 if (status != NVME_SC_SUCCESS)
1668                         break;
1669                 sc->write_dunits_remainder += (bytes / 512);
1670                 while (sc->write_dunits_remainder >= 1000) {
1671                         sc->write_data_units++;
1672                         sc->write_dunits_remainder -= 1000;
1673                 }
1674                 break;
1675         case NVME_OPC_READ:
1676                 sc->read_commands++;
1677                 if (status != NVME_SC_SUCCESS)
1678                         break;
1679                 sc->read_dunits_remainder += (bytes / 512);
1680                 while (sc->read_dunits_remainder >= 1000) {
1681                         sc->read_data_units++;
1682                         sc->read_dunits_remainder -= 1000;
1683                 }
1684                 break;
1685         default:
1686                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1687                 break;
1688         }
1689         pthread_mutex_unlock(&sc->mtx);
1690 }
1691
1692 /*
1693  * Check if the combination of Starting LBA (slba) and Number of Logical
1694  * Blocks (nlb) exceeds the range of the underlying storage.
1695  *
1696  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1697  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1698  * overflow.
1699  */
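     /*
      * For example, with 512 byte sectors (sectsz_bits = 9), an slba with any
      * of its top 9 bits set would overflow the byte offset; the first check
      * below rejects that case. The second check compares against the
      * remaining capacity rather than computing "offset + bytes", which could
      * wrap around.
      */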
1700 static bool
1701 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1702     uint32_t nlb)
1703 {
1704         size_t  offset, bytes;
1705
1706         /* Overflow check of multiplying Starting LBA by the sector size */
1707         if (slba >> (64 - nvstore->sectsz_bits))
1708                 return (true);
1709
1710         offset = slba << nvstore->sectsz_bits;
1711         bytes = nlb << nvstore->sectsz_bits;
1712
1713         /* Overflow check of Number of Logical Blocks */
1714         if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
1715                 return (true);
1716
1717         return (false);
1718 }
1719
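     /*
      * Add a guest data page to the blockif request being built up. Pages that
      * are contiguous in guest physical memory are merged into the previous
      * iovec entry rather than consuming a new one, and br_resid accumulates
      * the total transfer size.
      */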
1720 static int
1721 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1722         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1723 {
1724         int iovidx;
1725
1726         if (req == NULL)
1727                 return (-1);
1728
1729         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1730                 return (-1);
1731         }
1732
1733         /* concatenate contig block-iovs to minimize number of iovs */
1734         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1735                 iovidx = req->io_req.br_iovcnt - 1;
1736
1737                 req->io_req.br_iov[iovidx].iov_base =
1738                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1739                                      req->prev_gpaddr, size);
1740
1741                 req->prev_size += size;
1742                 req->io_req.br_resid += size;
1743
1744                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1745         } else {
1746                 iovidx = req->io_req.br_iovcnt;
1747                 if (iovidx == 0) {
1748                         req->io_req.br_offset = lba;
1749                         req->io_req.br_resid = 0;
1750                         req->io_req.br_param = req;
1751                 }
1752
1753                 req->io_req.br_iov[iovidx].iov_base =
1754                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1755                                      gpaddr, size);
1756
1757                 req->io_req.br_iov[iovidx].iov_len = size;
1758
1759                 req->prev_gpaddr = gpaddr;
1760                 req->prev_size = size;
1761                 req->io_req.br_resid += size;
1762
1763                 req->io_req.br_iovcnt++;
1764         }
1765
1766         return (0);
1767 }
1768
1769 static void
1770 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1771         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1772         uint32_t cdw0, uint16_t status)
1773 {
1774         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1775
1776         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1777                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1778                  NVME_STATUS_GET_SC(status));
1779
1780         pci_nvme_cq_update(sc, cq,
1781             0,          /* CDW0 */
1782             cid,
1783             sqid,
1784             status);
1785
1786         if (cq->head != cq->tail) {
1787                 if (cq->intr_en & NVME_CQ_INTEN) {
1788                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1789                 } else {
1790                         DPRINTF("%s: CQ%u interrupt disabled",
1791                                                 __func__, sq->cqid);
1792                 }
1793         }
1794 }
1795
1796 static void
1797 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1798 {
1799         req->sc = NULL;
1800         req->nvme_sq = NULL;
1801         req->sqid = 0;
1802
1803         pthread_mutex_lock(&sc->mtx);
1804
1805         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1806         sc->pending_ios--;
1807
1808         /* when no more IO pending, can set to ready if device reset/enabled */
1809         if (sc->pending_ios == 0 &&
1810             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1811                 sc->regs.csts |= NVME_CSTS_RDY;
1812
1813         pthread_mutex_unlock(&sc->mtx);
1814
1815         sem_post(&sc->iosemlock);
1816 }
1817
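     /*
      * Reserve a free I/O request slot. The iosemlock semaphore is initialized
      * to the number of ioslots, so this blocks until a slot is released by
      * pci_nvme_release_ioreq().
      */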
1818 static struct pci_nvme_ioreq *
1819 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1820 {
1821         struct pci_nvme_ioreq *req = NULL;
1822
1823         sem_wait(&sc->iosemlock);
1824         pthread_mutex_lock(&sc->mtx);
1825
1826         req = STAILQ_FIRST(&sc->ioreqs_free);
1827         assert(req != NULL);
1828         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1829
1830         req->sc = sc;
1831
1832         sc->pending_ios++;
1833
1834         pthread_mutex_unlock(&sc->mtx);
1835
1836         req->io_req.br_iovcnt = 0;
1837         req->io_req.br_offset = 0;
1838         req->io_req.br_resid = 0;
1839         req->io_req.br_param = req;
1840         req->prev_gpaddr = 0;
1841         req->prev_size = 0;
1842
1843         return req;
1844 }
1845
1846 static void
1847 pci_nvme_io_done(struct blockif_req *br, int err)
1848 {
1849         struct pci_nvme_ioreq *req = br->br_param;
1850         struct nvme_submission_queue *sq = req->nvme_sq;
1851         uint16_t code, status;
1852
1853         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1854
1855         /* TODO return correct error */
1856         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1857         pci_nvme_status_genc(&status, code);
1858
1859         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1860         pci_nvme_stats_write_read_update(req->sc, req->opc,
1861             req->bytes, status);
1862         pci_nvme_release_ioreq(req->sc, req);
1863 }
1864
1865 /*
1866  * Implements the Flush command. The specification states:
1867  *    If a volatile write cache is not present, Flush commands complete
1868  *    successfully and have no effect
1869  * in the description of the Volatile Write Cache (VWC) field of the Identify
1870  * Controller data. Therefore, set status to Success if the command is
1871  * not supported (i.e. RAM or as indicated by the blockif).
1872  */
1873 static bool
1874 nvme_opc_flush(struct pci_nvme_softc *sc,
1875     struct nvme_command *cmd,
1876     struct pci_nvme_blockstore *nvstore,
1877     struct pci_nvme_ioreq *req,
1878     uint16_t *status)
1879 {
1880         bool pending = false;
1881
1882         if (nvstore->type == NVME_STOR_RAM) {
1883                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1884         } else {
1885                 int err;
1886
1887                 req->io_req.br_callback = pci_nvme_io_done;
1888
1889                 err = blockif_flush(nvstore->ctx, &req->io_req);
1890                 switch (err) {
1891                 case 0:
1892                         pending = true;
1893                         break;
1894                 case EOPNOTSUPP:
1895                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1896                         break;
1897                 default:
1898                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1899                 }
1900         }
1901
1902         return (pending);
1903 }
1904
1905 static uint16_t
1906 nvme_write_read_ram(struct pci_nvme_softc *sc,
1907     struct pci_nvme_blockstore *nvstore,
1908     uint64_t prp1, uint64_t prp2,
1909     size_t offset, uint64_t bytes,
1910     bool is_write)
1911 {
1912         uint8_t *buf = nvstore->ctx;
1913         enum nvme_copy_dir dir;
1914         uint16_t status;
1915
1916         if (is_write)
1917                 dir = NVME_COPY_TO_PRP;
1918         else
1919                 dir = NVME_COPY_FROM_PRP;
1920
1921         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1922             buf + offset, bytes, dir))
1923                 pci_nvme_status_genc(&status,
1924                     NVME_SC_DATA_TRANSFER_ERROR);
1925         else
1926                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1927
1928         return (status);
1929 }
1930
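     /*
      * Build the iovec list from the command's PRP entries: PRP1 covers the
      * data up to the first page boundary; if the remainder fits in one page,
      * PRP2 points directly at it, otherwise PRP2 points at a PRP list whose
      * last entry chains to the next list page (see the loop below).
      */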
1931 static uint16_t
1932 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1933     struct pci_nvme_blockstore *nvstore,
1934     struct pci_nvme_ioreq *req,
1935     uint64_t prp1, uint64_t prp2,
1936     size_t offset, uint64_t bytes,
1937     bool is_write)
1938 {
1939         uint64_t size;
1940         int err;
1941         uint16_t status = NVME_NO_STATUS;
1942
1943         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1944         if (pci_nvme_append_iov_req(sc, req, prp1,
1945             size, is_write, offset)) {
1946                 pci_nvme_status_genc(&status,
1947                     NVME_SC_DATA_TRANSFER_ERROR);
1948                 goto out;
1949         }
1950
1951         offset += size;
1952         bytes  -= size;
1953
1954         if (bytes == 0) {
1955                 ;
1956         } else if (bytes <= PAGE_SIZE) {
1957                 size = bytes;
1958                 if (pci_nvme_append_iov_req(sc, req, prp2,
1959                     size, is_write, offset)) {
1960                         pci_nvme_status_genc(&status,
1961                             NVME_SC_DATA_TRANSFER_ERROR);
1962                         goto out;
1963                 }
1964         } else {
1965                 void *vmctx = sc->nsc_pi->pi_vmctx;
1966                 uint64_t *prp_list = &prp2;
1967                 uint64_t *last = prp_list;
1968
1969                 /* PRP2 is pointer to a physical region page list */
1970                 while (bytes) {
1971                         /* Last entry in list points to the next list */
1972                         if (prp_list == last) {
1973                                 uint64_t prp = *prp_list;
1974
1975                                 prp_list = paddr_guest2host(vmctx, prp,
1976                                     PAGE_SIZE - (prp % PAGE_SIZE));
1977                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
1978                         }
1979
1980                         size = MIN(bytes, PAGE_SIZE);
1981
1982                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
1983                             size, is_write, offset)) {
1984                                 pci_nvme_status_genc(&status,
1985                                     NVME_SC_DATA_TRANSFER_ERROR);
1986                                 goto out;
1987                         }
1988
1989                         offset += size;
1990                         bytes  -= size;
1991
1992                         prp_list++;
1993                 }
1994         }
1995         req->io_req.br_callback = pci_nvme_io_done;
1996         if (is_write)
1997                 err = blockif_write(nvstore->ctx, &req->io_req);
1998         else
1999                 err = blockif_read(nvstore->ctx, &req->io_req);
2000
2001         if (err)
2002                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2003 out:
2004         return (status);
2005 }
2006
2007 static bool
2008 nvme_opc_write_read(struct pci_nvme_softc *sc,
2009     struct nvme_command *cmd,
2010     struct pci_nvme_blockstore *nvstore,
2011     struct pci_nvme_ioreq *req,
2012     uint16_t *status)
2013 {
2014         uint64_t lba, nblocks, bytes;
2015         size_t offset;
2016         bool is_write = cmd->opc == NVME_OPC_WRITE;
2017         bool pending = false;
2018
2019         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2020         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2021         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2022                 WPRINTF("%s command would exceed LBA range", __func__);
2023                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2024                 goto out;
2025         }
2026
2027         bytes  = nblocks << nvstore->sectsz_bits;
2028         if (bytes > NVME_MAX_DATA_SIZE) {
2029                 WPRINTF("%s command would exceed MDTS", __func__);
2030                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2031                 goto out;
2032         }
2033
2034         offset = lba << nvstore->sectsz_bits;
2035
2036         req->bytes = bytes;
2037         req->io_req.br_offset = lba;
2038
2039         /* PRP bits 1:0 must be zero */
2040         cmd->prp1 &= ~0x3UL;
2041         cmd->prp2 &= ~0x3UL;
2042
2043         if (nvstore->type == NVME_STOR_RAM) {
2044                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2045                     cmd->prp2, offset, bytes, is_write);
2046         } else {
2047                 *status = nvme_write_read_blockif(sc, nvstore, req,
2048                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2049
2050                 if (*status == NVME_NO_STATUS)
2051                         pending = true;
2052         }
2053 out:
2054         if (!pending)
2055                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2056
2057         return (pending);
2058 }
2059
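     /*
      * Completion callback for multi-range Dataset Management deallocates.
      * The remaining ranges were stashed in br_iov by nvme_opc_dataset_mgmt()
      * (iov_base holds the byte offset, iov_len the length); prev_gpaddr is
      * reused as the current range index and prev_size as the range count, so
      * each completion issues the next blockif_delete() until all are done.
      */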
2060 static void
2061 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2062 {
2063         struct pci_nvme_ioreq *req = br->br_param;
2064         struct pci_nvme_softc *sc = req->sc;
2065         bool done = true;
2066         uint16_t status;
2067
2068         if (err) {
2069                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2070         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2071                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2072         } else {
2073                 struct iovec *iov = req->io_req.br_iov;
2074
2075                 req->prev_gpaddr++;
2076                 iov += req->prev_gpaddr;
2077
2078                 /* The iov_* values already include the sector size */
2079                 req->io_req.br_offset = (off_t)iov->iov_base;
2080                 req->io_req.br_resid = iov->iov_len;
2081                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2082                         pci_nvme_status_genc(&status,
2083                             NVME_SC_INTERNAL_DEVICE_ERROR);
2084                 } else
2085                         done = false;
2086         }
2087
2088         if (done) {
2089                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2090                     req->cid, 0, status);
2091                 pci_nvme_release_ioreq(sc, req);
2092         }
2093 }
2094
2095 static bool
2096 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2097     struct nvme_command *cmd,
2098     struct pci_nvme_blockstore *nvstore,
2099     struct pci_nvme_ioreq *req,
2100     uint16_t *status)
2101 {
2102         struct nvme_dsm_range *range;
2103         uint32_t nr, r, non_zero, dr;
2104         int err;
2105         bool pending = false;
2106
2107         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2108                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2109                 goto out;
2110         }
2111
2112         nr = cmd->cdw10 & 0xff;
2113
2114         /* copy locally because a range entry could straddle PRPs */
2115         range = calloc(1, NVME_MAX_DSM_TRIM);
2116         if (range == NULL) {
2117                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2118                 goto out;
2119         }
2120         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2121             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2122
2123         /* Check for invalid ranges and the number of non-zero lengths */
2124         non_zero = 0;
2125         for (r = 0; r <= nr; r++) {
2126                 if (pci_nvme_out_of_range(nvstore,
2127                     range[r].starting_lba, range[r].length)) {
2128                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2129                         goto out;
2130                 }
2131                 if (range[r].length != 0)
2132                         non_zero++;
2133         }
2134
2135         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2136                 size_t offset, bytes;
2137                 int sectsz_bits = sc->nvstore.sectsz_bits;
2138
2139                 /*
2140                  * DSM calls are advisory only, and compliant controllers
2141                  * may choose to take no actions (i.e. return Success).
2142                  */
2143                 if (!nvstore->deallocate) {
2144                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2145                         goto out;
2146                 }
2147
2148                 /* If all ranges have a zero length, return Success */
2149                 if (non_zero == 0) {
2150                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2151                         goto out;
2152                 }
2153
2154                 if (req == NULL) {
2155                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2156                         goto out;
2157                 }
2158
2159                 offset = range[0].starting_lba << sectsz_bits;
2160                 bytes = range[0].length << sectsz_bits;
2161
2162                 /*
2163                  * If the request is for more than a single range, store
2164                  * the ranges in the br_iov. Optimize for the common case
2165                  * of a single range.
2166                  *
2167                  * Note that NVMe Number of Ranges is a zero based value
2168                  */
2169                 req->io_req.br_iovcnt = 0;
2170                 req->io_req.br_offset = offset;
2171                 req->io_req.br_resid = bytes;
2172
2173                 if (nr == 0) {
2174                         req->io_req.br_callback = pci_nvme_io_done;
2175                 } else {
2176                         struct iovec *iov = req->io_req.br_iov;
2177
2178                         for (r = 0, dr = 0; r <= nr; r++) {
2179                                 offset = range[r].starting_lba << sectsz_bits;
2180                                 bytes = range[r].length << sectsz_bits;
2181                                 if (bytes == 0)
2182                                         continue;
2183
2184                                 if ((nvstore->size - offset) < bytes) {
2185                                         pci_nvme_status_genc(status,
2186                                             NVME_SC_LBA_OUT_OF_RANGE);
2187                                         goto out;
2188                                 }
2189                                 iov[dr].iov_base = (void *)offset;
2190                                 iov[dr].iov_len = bytes;
2191                                 dr++;
2192                         }
2193                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2194
2195                         /*
2196                          * Use prev_gpaddr to track the current entry and
2197                          * prev_size to track the number of entries
2198                          */
2199                         req->prev_gpaddr = 0;
2200                         req->prev_size = dr;
2201                 }
2202
2203                 err = blockif_delete(nvstore->ctx, &req->io_req);
2204                 if (err)
2205                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2206                 else
2207                         pending = true;
2208         }
2209 out:
2210         free(range);
2211         return (pending);
2212 }
2213
2214 static void
2215 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2216 {
2217         struct nvme_submission_queue *sq;
2218         uint16_t status;
2219         uint16_t sqhead;
2220
2221         /* handle all submissions up to sq->tail index */
2222         sq = &sc->submit_queues[idx];
2223
2224         pthread_mutex_lock(&sq->mtx);
2225
2226         sqhead = sq->head;
2227         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2228                  idx, sqhead, sq->tail, sq->qbase);
2229
2230         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2231                 struct nvme_command *cmd;
2232                 struct pci_nvme_ioreq *req;
2233                 uint32_t nsid;
2234                 bool pending;
2235
2236                 pending = false;
2237                 req = NULL;
2238                 status = 0;
2239
2240                 cmd = &sq->qbase[sqhead];
2241                 sqhead = (sqhead + 1) % sq->size;
2242
2243                 nsid = le32toh(cmd->nsid);
2244                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2245                         pci_nvme_status_genc(&status,
2246                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2247                         status |=
2248                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2249                         goto complete;
2250                 }
2251
2252                 req = pci_nvme_get_ioreq(sc);
2253                 if (req == NULL) {
2254                         pci_nvme_status_genc(&status,
2255                             NVME_SC_INTERNAL_DEVICE_ERROR);
2256                         WPRINTF("%s: unable to allocate IO req", __func__);
2257                         goto complete;
2258                 }
2259                 req->nvme_sq = sq;
2260                 req->sqid = idx;
2261                 req->opc = cmd->opc;
2262                 req->cid = cmd->cid;
2263                 req->nsid = cmd->nsid;
2264
2265                 switch (cmd->opc) {
2266                 case NVME_OPC_FLUSH:
2267                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2268                             req, &status);
2269                         break;
2270                 case NVME_OPC_WRITE:
2271                 case NVME_OPC_READ:
2272                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2273                             req, &status);
2274                         break;
2275                 case NVME_OPC_WRITE_ZEROES:
2276                         /* TODO: write zeroes
2277                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2278                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2279                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2280                         break;
2281                 case NVME_OPC_DATASET_MANAGEMENT:
2282                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2283                             req, &status);
2284                         break;
2285                 default:
2286                         WPRINTF("%s unhandled io command 0x%x",
2287                             __func__, cmd->opc);
2288                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2289                 }
2290 complete:
2291                 if (!pending) {
2292                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2293                             status);
2294                         if (req != NULL)
2295                                 pci_nvme_release_ioreq(sc, req);
2296                 }
2297         }
2298
2299         sq->head = sqhead;
2300
2301         pthread_mutex_unlock(&sq->mtx);
2302 }
2303
2304 static void
2305 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2306         uint64_t idx, int is_sq, uint64_t value)
2307 {
2308         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2309                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2310
2311         if (is_sq) {
2312                 if (idx > sc->num_squeues) {
2313                         WPRINTF("%s queue index %lu overflow from "
2314                                  "guest (max %u)",
2315                                  __func__, idx, sc->num_squeues);
2316                         return;
2317                 }
2318
2319                 atomic_store_short(&sc->submit_queues[idx].tail,
2320                                    (uint16_t)value);
2321
2322                 if (idx == 0) {
2323                         pci_nvme_handle_admin_cmd(sc, value);
2324                 } else {
2325                         /* submission queue; handle new entries in SQ */
2326                         if (idx > sc->num_squeues) {
2327                                 WPRINTF("%s SQ index %lu overflow from "
2328                                          "guest (max %u)",
2329                                          __func__, idx, sc->num_squeues);
2330                                 return;
2331                         }
2332                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2333                 }
2334         } else {
2335                 if (idx > sc->num_cqueues) {
2336                         WPRINTF("%s queue index %lu overflow from "
2337                                  "guest (max %u)",
2338                                  __func__, idx, sc->num_cqueues);
2339                         return;
2340                 }
2341
2342                 atomic_store_short(&sc->compl_queues[idx].head,
2343                                 (uint16_t)value);
2344         }
2345 }
2346
2347 static void
2348 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2349 {
2350         const char *s = iswrite ? "WRITE" : "READ";
2351
2352         switch (offset) {
2353         case NVME_CR_CAP_LOW:
2354                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2355                 break;
2356         case NVME_CR_CAP_HI:
2357                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2358                 break;
2359         case NVME_CR_VS:
2360                 DPRINTF("%s %s NVME_CR_VS", func, s);
2361                 break;
2362         case NVME_CR_INTMS:
2363                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2364                 break;
2365         case NVME_CR_INTMC:
2366                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2367                 break;
2368         case NVME_CR_CC:
2369                 DPRINTF("%s %s NVME_CR_CC", func, s);
2370                 break;
2371         case NVME_CR_CSTS:
2372                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2373                 break;
2374         case NVME_CR_NSSR:
2375                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2376                 break;
2377         case NVME_CR_AQA:
2378                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2379                 break;
2380         case NVME_CR_ASQ_LOW:
2381                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2382                 break;
2383         case NVME_CR_ASQ_HI:
2384                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2385                 break;
2386         case NVME_CR_ACQ_LOW:
2387                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2388                 break;
2389         case NVME_CR_ACQ_HI:
2390                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2391                 break;
2392         default:
2393                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2394         }
2395
2396 }
2397
2398 static void
2399 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2400         uint64_t offset, int size, uint64_t value)
2401 {
2402         uint32_t ccreg;
2403
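             /*
              * Doorbell registers come in 8 byte pairs per queue: the SQ tail
              * doorbell at offset 0 within the pair and the CQ head doorbell
              * at offset 4 (i.e. a doorbell stride of 0), which is what the
              * divide and modulo by 8 below decode.
              */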
2404         if (offset >= NVME_DOORBELL_OFFSET) {
2405                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2406                 uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2407                 int is_sq = (belloffset % 8) < 4;
2408
2409                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2410                         WPRINTF("guest attempted an overflow write offset "
2411                                  "0x%lx, val 0x%lx in %s",
2412                                  offset, value, __func__);
2413                         return;
2414                 }
2415
2416                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2417                 return;
2418         }
2419
2420         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2421                 offset, size, value);
2422
2423         if (size != 4) {
2424                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2425                          "val 0x%lx) to bar0 in %s",
2426                          size, offset, value, __func__);
2427                 /* TODO: shutdown device */
2428                 return;
2429         }
2430
2431         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2432
2433         pthread_mutex_lock(&sc->mtx);
2434
2435         switch (offset) {
2436         case NVME_CR_CAP_LOW:
2437         case NVME_CR_CAP_HI:
2438                 /* readonly */
2439                 break;
2440         case NVME_CR_VS:
2441                 /* readonly */
2442                 break;
2443         case NVME_CR_INTMS:
2444                 /* MSI-X, so ignore */
2445                 break;
2446         case NVME_CR_INTMC:
2447                 /* MSI-X, so ignore */
2448                 break;
2449         case NVME_CR_CC:
2450                 ccreg = (uint32_t)value;
2451
2452                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2453                          "iocqes %u",
2454                         __func__,
2455                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2456                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2457                          NVME_CC_GET_IOCQES(ccreg));
2458
2459                 if (NVME_CC_GET_SHN(ccreg)) {
2460                         /* perform shutdown - flush out data to backend */
2461                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2462                             NVME_CSTS_REG_SHST_SHIFT);
2463                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2464                             NVME_CSTS_REG_SHST_SHIFT;
2465                 }
2466                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2467                         if (NVME_CC_GET_EN(ccreg) == 0)
2468                                 /* transition 1->0 causes controller reset */
2469                                 pci_nvme_reset_locked(sc);
2470                         else
2471                                 pci_nvme_init_controller(ctx, sc);
2472                 }
2473
2474                 /* Insert the iocqes, iosqes and en bits from the write */
2475                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2476                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2477                 if (NVME_CC_GET_EN(ccreg) == 0) {
2478                         /* Insert the ams, mps and css bit fields */
2479                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2480                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2481                         sc->regs.csts &= ~NVME_CSTS_RDY;
2482                 } else if (sc->pending_ios == 0) {
2483                         sc->regs.csts |= NVME_CSTS_RDY;
2484                 }
2485                 break;
2486         case NVME_CR_CSTS:
2487                 break;
2488         case NVME_CR_NSSR:
2489                 /* ignore writes; don't support subsystem reset */
2490                 break;
2491         case NVME_CR_AQA:
2492                 sc->regs.aqa = (uint32_t)value;
2493                 break;
2494         case NVME_CR_ASQ_LOW:
2495                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2496                                (0xFFFFF000 & value);
2497                 break;
2498         case NVME_CR_ASQ_HI:
2499                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2500                                (value << 32);
2501                 break;
2502         case NVME_CR_ACQ_LOW:
2503                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2504                                (0xFFFFF000 & value);
2505                 break;
2506         case NVME_CR_ACQ_HI:
2507                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2508                                (value << 32);
2509                 break;
2510         default:
2511                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2512                          __func__, offset, value, size);
2513         }
2514         pthread_mutex_unlock(&sc->mtx);
2515 }
2516
2517 static void
2518 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2519                 int baridx, uint64_t offset, int size, uint64_t value)
2520 {
2521         struct pci_nvme_softc* sc = pi->pi_arg;
2522
2523         if (baridx == pci_msix_table_bar(pi) ||
2524             baridx == pci_msix_pba_bar(pi)) {
2525                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2526                          " value 0x%lx", baridx, offset, size, value);
2527
2528                 pci_emul_msix_twrite(pi, offset, size, value);
2529                 return;
2530         }
2531
2532         switch (baridx) {
2533         case 0:
2534                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2535                 break;
2536
2537         default:
2538                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2539                          __func__, baridx, value);
2540         }
2541 }
2542
2543 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2544         uint64_t offset, int size)
2545 {
2546         uint64_t value;
2547
2548         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2549
2550         if (offset < NVME_DOORBELL_OFFSET) {
2551                 void *p = &(sc->regs);
2552                 pthread_mutex_lock(&sc->mtx);
2553                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2554                 pthread_mutex_unlock(&sc->mtx);
2555         } else {
2556                 value = 0;
2557                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2558         }
2559
2560         switch (size) {
2561         case 1:
2562                 value &= 0xFF;
2563                 break;
2564         case 2:
2565                 value &= 0xFFFF;
2566                 break;
2567         case 4:
2568                 value &= 0xFFFFFFFF;
2569                 break;
2570         }
2571
2572         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2573                  offset, size, (uint32_t)value);
2574
2575         return (value);
2576 }
2577
2578
2579
2580 static uint64_t
2581 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2582     uint64_t offset, int size)
2583 {
2584         struct pci_nvme_softc* sc = pi->pi_arg;
2585
2586         if (baridx == pci_msix_table_bar(pi) ||
2587             baridx == pci_msix_pba_bar(pi)) {
2588                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2589                         baridx, offset, size);
2590
2591                 return pci_emul_msix_tread(pi, offset, size);
2592         }
2593
2594         switch (baridx) {
2595         case 0:
2596                 return pci_nvme_read_bar_0(sc, offset, size);
2597
2598         default:
2599                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2600         }
2601
2602         return (0);
2603 }
2604
2605
2606 static int
2607 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2608 {
2609         char bident[sizeof("XX:X:X")];
2610         char    *uopt, *xopts, *config;
2611         uint32_t sectsz;
2612         int optidx;
2613
2614         sc->max_queues = NVME_QUEUES;
2615         sc->max_qentries = NVME_MAX_QENTRIES;
2616         sc->ioslots = NVME_IOSLOTS;
2617         sc->num_squeues = sc->max_queues;
2618         sc->num_cqueues = sc->max_queues;
2619         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2620         sectsz = 0;
2621
2622         uopt = strdup(opts);
2623         optidx = 0;
2624         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2625                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2626         for (xopts = strtok(uopt, ",");
2627              xopts != NULL;
2628              xopts = strtok(NULL, ",")) {
2629
2630                 if ((config = strchr(xopts, '=')) != NULL)
2631                         *config++ = '\0';
2632
2633                 if (!strcmp("maxq", xopts)) {
2634                         sc->max_queues = atoi(config);
2635                 } else if (!strcmp("qsz", xopts)) {
2636                         sc->max_qentries = atoi(config);
2637                 } else if (!strcmp("ioslots", xopts)) {
2638                         sc->ioslots = atoi(config);
2639                 } else if (!strcmp("sectsz", xopts)) {
2640                         sectsz = atoi(config);
2641                 } else if (!strcmp("ser", xopts)) {
2642                         /*
2643                          * This field indicates the Product Serial Number in
2644                          * 7-bit ASCII, unused bytes should be space characters.
2645                          * Ref: NVMe v1.3c.
2646                          */
2647                         cpywithpad((char *)sc->ctrldata.sn,
2648                                    sizeof(sc->ctrldata.sn), config, ' ');
2649                 } else if (!strcmp("ram", xopts)) {
2650                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
2651
2652                         sc->nvstore.type = NVME_STOR_RAM;
2653                         sc->nvstore.size = sz * 1024 * 1024;
2654                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2655                         sc->nvstore.sectsz = 4096;
2656                         sc->nvstore.sectsz_bits = 12;
2657                         if (sc->nvstore.ctx == NULL) {
2658                                 perror("Unable to allocate RAM");
2659                                 free(uopt);
2660                                 return (-1);
2661                         }
2662                 } else if (!strcmp("eui64", xopts)) {
2663                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2664                 } else if (!strcmp("dsm", xopts)) {
2665                         if (!strcmp("auto", config))
2666                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2667                         else if (!strcmp("enable", config))
2668                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2669                         else if (!strcmp("disable", config))
2670                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2671                 } else if (optidx == 0) {
2672                         snprintf(bident, sizeof(bident), "%d:%d",
2673                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2674                         sc->nvstore.ctx = blockif_open(xopts, bident);
2675                         if (sc->nvstore.ctx == NULL) {
2676                                 perror("Could not open backing file");
2677                                 free(uopt);
2678                                 return (-1);
2679                         }
2680                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2681                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2682                 } else {
2683                         EPRINTLN("Invalid option %s", xopts);
2684                         free(uopt);
2685                         return (-1);
2686                 }
2687
2688                 optidx++;
2689         }
2690         free(uopt);
2691
2692         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2693                 EPRINTLN("backing store not specified");
2694                 return (-1);
2695         }
2696         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2697                 sc->nvstore.sectsz = sectsz;
2698         else if (sc->nvstore.type != NVME_STOR_RAM)
2699                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2700         for (sc->nvstore.sectsz_bits = 9;
2701              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2702              sc->nvstore.sectsz_bits++);
2703
2704         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2705                 sc->max_queues = NVME_QUEUES;
2706
2707         if (sc->max_qentries <= 0) {
2708                 EPRINTLN("Invalid qsz option");
2709                 return (-1);
2710         }
2711         if (sc->ioslots <= 0) {
2712                 EPRINTLN("Invalid ioslots option");
2713                 return (-1);
2714         }
2715
2716         return (0);
2717 }
2718
2719 static int
2720 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2721 {
2722         struct pci_nvme_softc *sc;
2723         uint32_t pci_membar_sz;
2724         int     error;
2725
2726         error = 0;
2727
2728         sc = calloc(1, sizeof(struct pci_nvme_softc));
2729         pi->pi_arg = sc;
2730         sc->nsc_pi = pi;
2731
2732         error = pci_nvme_parse_opts(sc, opts);
2733         if (error < 0)
2734                 goto done;
2735         else
2736                 error = 0;
2737
2738         STAILQ_INIT(&sc->ioreqs_free);
2739         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2740         for (int i = 0; i < sc->ioslots; i++) {
2741                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2742         }
2743
2744         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2745         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2746         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2747         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2748         pci_set_cfgdata8(pi, PCIR_PROGIF,
2749                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2750
2751         /*
2752          * Allocate size of NVMe registers + doorbell space for all queues.
2753          *
2754          * The specification requires a minimum memory I/O window size of 16K.
2755          * The Windows driver will refuse to start a device with a smaller
2756          * window.
2757          */
2758         pci_membar_sz = sizeof(struct nvme_registers) +
2759             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2760         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2761
2762         DPRINTF("nvme membar size: %u", pci_membar_sz);
2763
2764         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2765         if (error) {
2766                 WPRINTF("%s pci alloc mem bar failed", __func__);
2767                 goto done;
2768         }
2769
2770         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2771         if (error) {
2772                 WPRINTF("%s pci add msixcap failed", __func__);
2773                 goto done;
2774         }
2775
2776         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2777         if (error) {
2778                 WPRINTF("%s pci add Express capability failed", __func__);
2779                 goto done;
2780         }
2781
2782         pthread_mutex_init(&sc->mtx, NULL);
2783         sem_init(&sc->iosemlock, 0, sc->ioslots);
2784
2785         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2786         /*
2787          * Controller data depends on Namespace data so initialize Namespace
2788          * data first.
2789          */
2790         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2791         pci_nvme_init_ctrldata(sc);
2792         pci_nvme_init_logpages(sc);
2793         pci_nvme_init_features(sc);
2794
2795         pci_nvme_aer_init(sc);
2796
2797         pci_nvme_reset(sc);
2798
2799         pci_lintr_request(pi);
2800
2801 done:
2802         return (error);
2803 }
2804
2805
2806 struct pci_devemu pci_de_nvme = {
2807         .pe_emu =       "nvme",
2808         .pe_init =      pci_nvme_init,
2809         .pe_barwrite =  pci_nvme_write,
2810         .pe_barread =   pci_nvme_read
2811 };
2812 PCI_EMUL_SET(pci_de_nvme);