1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
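/*
 * Illustrative invocations (the device path, image path, and serial number
 * below are placeholders, not defaults):
 *
 *  -s 4,nvme,/dev/zvol/tank/nvmedisk,ser=NVME0001
 *  -s 4,nvme,ram=4096,sectsz=4096,maxq=8,qsz=256,ioslots=16
 */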
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN             0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
106
107 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
108 #define NVME_MDTS               9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
112
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS          0xffff
115 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
116
117 /* helpers */
118
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)         ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)         ((one)  - 1)
123
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127          (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
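/*
 * Worked example (illustrative): with 4 IO Submission Queues and 2 IO
 * Completion Queues allocated, the macro above evaluates to 0x00010003 --
 * each count is reported zero-based, NSQA in the low word and NCQA in the
 * high word, which is the format of the Number of Queues feature.
 */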
128
129 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
130
131 enum nvme_controller_register_offsets {
132         NVME_CR_CAP_LOW = 0x00,
133         NVME_CR_CAP_HI  = 0x04,
134         NVME_CR_VS      = 0x08,
135         NVME_CR_INTMS   = 0x0c,
136         NVME_CR_INTMC   = 0x10,
137         NVME_CR_CC      = 0x14,
138         NVME_CR_CSTS    = 0x1c,
139         NVME_CR_NSSR    = 0x20,
140         NVME_CR_AQA     = 0x24,
141         NVME_CR_ASQ_LOW = 0x28,
142         NVME_CR_ASQ_HI  = 0x2c,
143         NVME_CR_ACQ_LOW = 0x30,
144         NVME_CR_ACQ_HI  = 0x34,
145 };
146
147 enum nvme_cmd_cdw11 {
148         NVME_CMD_CDW11_PC  = 0x0001,
149         NVME_CMD_CDW11_IEN = 0x0002,
150         NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152
153 enum nvme_copy_dir {
154         NVME_COPY_TO_PRP,
155         NVME_COPY_FROM_PRP,
156 };
157
158 #define NVME_CQ_INTEN   0x01
159 #define NVME_CQ_INTCOAL 0x02
160
161 struct nvme_completion_queue {
162         struct nvme_completion *qbase;
163         pthread_mutex_t mtx;
164         uint32_t        size;
165         uint16_t        tail; /* nvme progress */
166         uint16_t        head; /* guest progress */
167         uint16_t        intr_vec;
168         uint32_t        intr_en;
169 };
170
171 struct nvme_submission_queue {
172         struct nvme_command *qbase;
173         pthread_mutex_t mtx;
174         uint32_t        size;
175         uint16_t        head; /* nvme progress */
176         uint16_t        tail; /* guest progress */
177         uint16_t        cqid; /* completion queue id */
178         int             qpriority;
179 };
180
181 enum nvme_storage_type {
182         NVME_STOR_BLOCKIF = 0,
183         NVME_STOR_RAM = 1,
184 };
185
186 struct pci_nvme_blockstore {
187         enum nvme_storage_type type;
188         void            *ctx;
189         uint64_t        size;
190         uint32_t        sectsz;
191         uint32_t        sectsz_bits;
192         uint64_t        eui64;
193         uint32_t        deallocate:1;
194 };
195
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
203 #define MDTS_PAD_SIZE \
204         NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205         NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206         0
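/*
 * Worked example of the padding arithmetic (BLOCKIF_IOV_MAX comes from
 * block_if.h; a value of 128 is assumed here purely for illustration):
 * with NVME_MDTS = 9, NVME_MAX_IOVEC is (1 << 9) + 1 = 513, so MDTS_PAD_SIZE
 * would be 513 - 128 = 385 additional iovec entries per request.
 */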
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 struct pci_nvme_aer {
255         STAILQ_ENTRY(pci_nvme_aer) link;
256         uint16_t        cid;    /* Command ID of the submitted AER */
257 };
258
259 struct pci_nvme_softc {
260         struct pci_devinst *nsc_pi;
261
262         pthread_mutex_t mtx;
263
264         struct nvme_registers regs;
265
266         struct nvme_namespace_data  nsdata;
267         struct nvme_controller_data ctrldata;
268         struct nvme_error_information_entry err_log;
269         struct nvme_health_information_page health_log;
270         struct nvme_firmware_page fw_log;
271
272         struct pci_nvme_blockstore nvstore;
273
274         uint16_t        max_qentries;   /* max entries per queue */
275         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
276         uint32_t        num_cqueues;
277         uint32_t        num_squeues;
278         bool            num_q_is_set; /* Has host set Number of Queues */
279
280         struct pci_nvme_ioreq *ioreqs;
281         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
282         uint32_t        pending_ios;
283         uint32_t        ioslots;
284         sem_t           iosemlock;
285
286         /*
287          * Memory mapped Submission and Completion queues
288          * Each array includes both Admin and IO queues
289          */
290         struct nvme_completion_queue *compl_queues;
291         struct nvme_submission_queue *submit_queues;
292
293         struct nvme_feature_obj feat[NVME_FID_MAX];
294
295         enum nvme_dsm_type dataset_management;
296
297         /* Accounting for SMART data */
298         __uint128_t     read_data_units;
299         __uint128_t     write_data_units;
300         __uint128_t     read_commands;
301         __uint128_t     write_commands;
302         uint32_t        read_dunits_remainder;
303         uint32_t        write_dunits_remainder;
304
305         STAILQ_HEAD(, pci_nvme_aer) aer_list;
306         uint32_t        aer_count;
307 };
308
309
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
313
314 /* Controller Configuration utils */
315 #define NVME_CC_GET_EN(cc) \
316         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
317 #define NVME_CC_GET_CSS(cc) \
318         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
319 #define NVME_CC_GET_SHN(cc) \
320         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
321 #define NVME_CC_GET_IOSQES(cc) \
322         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
323 #define NVME_CC_GET_IOCQES(cc) \
324         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
325
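/*
 * The masks below separate the CC fields this emulation treats as writable
 * in general (EN and the IO queue entry sizes) from the "NEN" (not enabled)
 * group (CSS, MPS, AMS), which is presumably only accepted while the
 * controller is not yet enabled.
 */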
326 #define NVME_CC_WRITE_MASK \
327         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
328          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
329          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
330
331 #define NVME_CC_NEN_WRITE_MASK \
332         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
333          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
334          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
335
336 /* Controller Status utils */
337 #define NVME_CSTS_GET_RDY(sts) \
338         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
339
340 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
341
342 /* Completion Queue status word utils */
343 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
344 #define NVME_STATUS_MASK \
345         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
346          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
347
348 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
349         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
350
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352     struct nvme_feature_obj *,
353     struct nvme_command *,
354     struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359 static void nvme_feature_iv_config(struct pci_nvme_softc *,
360     struct nvme_feature_obj *,
361     struct nvme_command *,
362     struct nvme_completion *);
363
364 static __inline void
365 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
366 {
367         size_t len;
368
369         len = strnlen(src, dst_size);
370         memset(dst, pad, dst_size);
371         memcpy(dst, src, len);
372 }
373
374 static __inline void
375 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
376 {
377
378         *status &= ~NVME_STATUS_MASK;
379         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
380                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
381 }
382
383 static __inline void
384 pci_nvme_status_genc(uint16_t *status, uint16_t code)
385 {
386
387         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
388 }
389
390 /*
391  * Initialize the requested number of IO Submission and Completion Queues.
392  * Admin queues are allocated implicitly.
393  */
394 static void
395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
396 {
397         uint32_t i;
398
399         /*
400          * Allocate and initialize the Submission Queues
401          */
402         if (nsq > NVME_QUEUES) {
403                 WPRINTF("%s: clamping number of SQ from %u to %u",
404                                         __func__, nsq, NVME_QUEUES);
405                 nsq = NVME_QUEUES;
406         }
407
408         sc->num_squeues = nsq;
409
410         sc->submit_queues = calloc(sc->num_squeues + 1,
411                                 sizeof(struct nvme_submission_queue));
412         if (sc->submit_queues == NULL) {
413                 WPRINTF("%s: SQ allocation failed", __func__);
414                 sc->num_squeues = 0;
415         } else {
416                 struct nvme_submission_queue *sq = sc->submit_queues;
417
418                 for (i = 0; i < sc->num_squeues; i++)
419                         pthread_mutex_init(&sq[i].mtx, NULL);
420         }
421
422         /*
423          * Allocate and initialize the Completion Queues
424          */
425         if (ncq > NVME_QUEUES) {
426                 WPRINTF("%s: clamping number of CQ from %u to %u",
427                                         __func__, ncq, NVME_QUEUES);
428                 ncq = NVME_QUEUES;
429         }
430
431         sc->num_cqueues = ncq;
432
433         sc->compl_queues = calloc(sc->num_cqueues + 1,
434                                 sizeof(struct nvme_completion_queue));
435         if (sc->compl_queues == NULL) {
436                 WPRINTF("%s: CQ allocation failed", __func__);
437                 sc->num_cqueues = 0;
438         } else {
439                 struct nvme_completion_queue *cq = sc->compl_queues;
440
441                 for (i = 0; i < sc->num_cqueues; i++)
442                         pthread_mutex_init(&cq[i].mtx, NULL);
443         }
444 }
445
446 static void
447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
448 {
449         struct nvme_controller_data *cd = &sc->ctrldata;
450
451         cd->vid = 0xFB5D;
452         cd->ssvid = 0x0000;
453
454         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
455         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
456
457         /* Num of submission commands that we can handle at a time (2^rab) */
458         cd->rab   = 4;
459
460         /* FreeBSD OUI */
461         cd->ieee[0] = 0x58;
462         cd->ieee[1] = 0x9c;
463         cd->ieee[2] = 0xfc;
464
465         cd->mic = 0;
466
467         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
468
469         cd->ver = 0x00010300;
470
471         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
472         cd->acl = 2;
473         cd->aerl = 4;
474
475         /* Advertise 1, Read-only firmware slot */
476         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
477             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
478         cd->lpa = 0;    /* TODO: support some simple things like SMART */
479         cd->elpe = 0;   /* max error log page entries */
480         cd->npss = 1;   /* number of power states supported */
481
482         /* Warning Composite Temperature Threshold */
483         cd->wctemp = 0x0157;
484
485         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
486             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
487         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
488             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
489         cd->nn = 1;     /* number of namespaces */
490
491         cd->oncs = 0;
492         switch (sc->dataset_management) {
493         case NVME_DATASET_MANAGEMENT_AUTO:
494                 if (sc->nvstore.deallocate)
495                         cd->oncs |= NVME_ONCS_DSM;
496                 break;
497         case NVME_DATASET_MANAGEMENT_ENABLE:
498                 cd->oncs |= NVME_ONCS_DSM;
499                 break;
500         default:
501                 break;
502         }
503
504         cd->fna = 0x03;
505
506         cd->power_state[0].mp = 10;
507 }
508
509 /*
510  * Calculate the CRC-16 of the given buffer
511  * See copyright attribution at top of file
512  */
513 static uint16_t
514 crc16(uint16_t crc, const void *buffer, unsigned int len)
515 {
516         const unsigned char *cp = buffer;
517         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
518         static uint16_t const crc16_table[256] = {
519                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
520                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
521                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
522                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
523                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
524                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
525                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
526                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
527                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
528                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
529                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
530                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
531                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
532                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
533                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
534                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
535                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
536                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
537                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
538                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
539                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
540                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
541                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
542                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
543                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
544                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
545                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
546                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
547                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
548                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
549                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
550                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
551         };
552
553         while (len--)
554                 crc = (((crc >> 8) & 0xffU) ^
555                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
556         return crc;
557 }
558
559 static void
560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
561     struct nvme_namespace_data *nd, uint32_t nsid,
562     struct pci_nvme_blockstore *nvstore)
563 {
564
565         /* Get capacity and block size information from backing store */
566         nd->nsze = nvstore->size / nvstore->sectsz;
567         nd->ncap = nd->nsze;
568         nd->nuse = nd->nsze;
569
570         if (nvstore->type == NVME_STOR_BLOCKIF)
571                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
572
573         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
574         nd->flbas = 0;
575
576         /* Create an EUI-64 if user did not provide one */
577         if (nvstore->eui64 == 0) {
578                 char *data = NULL;
579                 uint64_t eui64 = nvstore->eui64;
580
581                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
582                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
583
584                 if (data != NULL) {
585                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
586                         free(data);
587                 }
588                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
589         }
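        /*
         * When auto-generated, the EUI-64 carries the FreeBSD OUI-derived
         * prefix and a CRC-16 of the VM name and PCI bus/slot/function in its
         * upper bytes, with the namespace ID in the low 16 bits.
         */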
590         be64enc(nd->eui64, nvstore->eui64);
591
592         /* LBA data-sz = 2^lbads */
593         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
594 }
595
596 static void
597 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
598 {
599
600         memset(&sc->err_log, 0, sizeof(sc->err_log));
601         memset(&sc->health_log, 0, sizeof(sc->health_log));
602         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
603
604         /* Set read/write remainder to round up according to spec */
605         sc->read_dunits_remainder = 999;
606         sc->write_dunits_remainder = 999;
607 }
608
609 static void
610 pci_nvme_init_features(struct pci_nvme_softc *sc)
611 {
612
613         sc->feat[0].set = nvme_feature_invalid_cb;
614         sc->feat[0].get = nvme_feature_invalid_cb;
615
616         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
617         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
618         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
619         sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
620             nvme_feature_iv_config;
621         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
622             nvme_feature_invalid_cb;
623         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
624             nvme_feature_invalid_cb;
625 }
626
627 static void
628 pci_nvme_aer_init(struct pci_nvme_softc *sc)
629 {
630
631         STAILQ_INIT(&sc->aer_list);
632         sc->aer_count = 0;
633 }
634
635 static void
636 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
637 {
638         struct pci_nvme_aer *aer = NULL;
639
640         while (!STAILQ_EMPTY(&sc->aer_list)) {
641                 aer = STAILQ_FIRST(&sc->aer_list);
642                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
643                 free(aer);
644         }
645
646         pci_nvme_aer_init(sc);
647 }
648
649 static bool
650 pci_nvme_aer_available(struct pci_nvme_softc *sc)
651 {
652
653         return (!STAILQ_EMPTY(&sc->aer_list));
654 }
655
656 static bool
657 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
658 {
659         struct nvme_controller_data *cd = &sc->ctrldata;
660
661         /* AERL is a zero-based value while aer_count is one-based */
662         return (sc->aer_count == (cd->aerl + 1));
663 }
664
665 /*
666  * Add an Async Event Request
667  *
668  * Stores an AER to be returned later if the Controller needs to notify the
669  * host of an event.
670  * Note that while the NVMe spec doesn't require Controllers to return AER's
671  * in order, this implementation does preserve the order.
672  */
673 static int
674 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
675 {
676         struct pci_nvme_aer *aer = NULL;
677
678         if (pci_nvme_aer_limit_reached(sc))
679                 return (-1);
680
681         aer = calloc(1, sizeof(struct pci_nvme_aer));
682         if (aer == NULL)
683                 return (-1);
684
685         sc->aer_count++;
686
687         /* Save the Command ID for use in the completion message */
688         aer->cid = cid;
689         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
690
691         return (0);
692 }
693
694 /*
695  * Get an Async Event Request structure
696  *
697  * Returns a pointer to an AER previously submitted by the host or NULL if
698  * no AER's exist. Caller is responsible for freeing the returned struct.
699  */
700 static struct pci_nvme_aer *
701 pci_nvme_aer_get(struct pci_nvme_softc *sc)
702 {
703         struct pci_nvme_aer *aer = NULL;
704
705         aer = STAILQ_FIRST(&sc->aer_list);
706         if (aer != NULL) {
707                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
708                 sc->aer_count--;
709         }
710         
711         return (aer);
712 }
713
714 static void
715 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
716 {
717         uint32_t i;
718
719         DPRINTF("%s", __func__);
720
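        /*
         * CAP advertises the zero-based maximum queue entries (MQES), that
         * queues must be physically contiguous (CQR), and a timeout (TO) of
         * 60 units of 500 ms (i.e. 30 seconds).
         */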
721         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
722             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
723             (60 << NVME_CAP_LO_REG_TO_SHIFT);
724
725         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
726
727         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
728
729         sc->regs.cc = 0;
730         sc->regs.csts = 0;
731
732         assert(sc->submit_queues != NULL);
733
734         for (i = 0; i < sc->num_squeues + 1; i++) {
735                 sc->submit_queues[i].qbase = NULL;
736                 sc->submit_queues[i].size = 0;
737                 sc->submit_queues[i].cqid = 0;
738                 sc->submit_queues[i].tail = 0;
739                 sc->submit_queues[i].head = 0;
740         }
741
742         assert(sc->compl_queues != NULL);
743
744         for (i = 0; i < sc->num_cqueues + 1; i++) {
745                 sc->compl_queues[i].qbase = NULL;
746                 sc->compl_queues[i].size = 0;
747                 sc->compl_queues[i].tail = 0;
748                 sc->compl_queues[i].head = 0;
749         }
750
751         sc->num_q_is_set = false;
752
753         pci_nvme_aer_destroy(sc);
754 }
755
756 static void
757 pci_nvme_reset(struct pci_nvme_softc *sc)
758 {
759         pthread_mutex_lock(&sc->mtx);
760         pci_nvme_reset_locked(sc);
761         pthread_mutex_unlock(&sc->mtx);
762 }
763
764 static void
765 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
766 {
767         uint16_t acqs, asqs;
768
769         DPRINTF("%s", __func__);
770
771         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
772         sc->submit_queues[0].size = asqs;
773         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
774                     sizeof(struct nvme_command) * asqs);
775
776         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
777                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
778
779         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
780             NVME_AQA_REG_ACQS_MASK) + 1;
781         sc->compl_queues[0].size = acqs;
782         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
783                  sizeof(struct nvme_completion) * acqs);
784         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
785
786         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
787                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
788 }
789
790 static int
791 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
792         size_t len, enum nvme_copy_dir dir)
793 {
794         uint8_t *p;
795         size_t bytes;
796
797         if (len > (8 * 1024)) {
798                 return (-1);
799         }
800
801         /* Copy from the start of prp1 to the end of the physical page */
802         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
803         bytes = MIN(bytes, len);
804
805         p = vm_map_gpa(ctx, prp1, bytes);
806         if (p == NULL) {
807                 return (-1);
808         }
809
810         if (dir == NVME_COPY_TO_PRP)
811                 memcpy(p, b, bytes);
812         else
813                 memcpy(b, p, bytes);
814
815         b += bytes;
816
817         len -= bytes;
818         if (len == 0) {
819                 return (0);
820         }
821
822         len = MIN(len, PAGE_SIZE);
823
824         p = vm_map_gpa(ctx, prp2, len);
825         if (p == NULL) {
826                 return (-1);
827         }
828
829         if (dir == NVME_COPY_TO_PRP)
830                 memcpy(p, b, len);
831         else
832                 memcpy(b, p, len);
833
834         return (0);
835 }
836
837 /*
838  * Write a Completion Queue Entry update
839  *
840  * Write the completion and update the doorbell value
841  */
842 static void
843 pci_nvme_cq_update(struct pci_nvme_softc *sc,
844                 struct nvme_completion_queue *cq,
845                 uint32_t cdw0,
846                 uint16_t cid,
847                 uint16_t sqid,
848                 uint16_t status)
849 {
850         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
851         struct nvme_completion *cqe;
852
853         assert(cq->qbase != NULL);
854
855         pthread_mutex_lock(&cq->mtx);
856
857         cqe = &cq->qbase[cq->tail];
858
859         /* Flip the phase bit */
860         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
861
862         cqe->cdw0 = cdw0;
863         cqe->sqhd = sq->head;
864         cqe->sqid = sqid;
865         cqe->cid = cid;
866         cqe->status = status;
867
868         cq->tail++;
869         if (cq->tail >= cq->size) {
870                 cq->tail = 0;
871         }
872
873         pthread_mutex_unlock(&cq->mtx);
874 }
875
876 static int
877 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
878         struct nvme_completion* compl)
879 {
880         uint16_t qid = command->cdw10 & 0xffff;
881
882         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
883         if (qid == 0 || qid > sc->num_squeues ||
884             (sc->submit_queues[qid].qbase == NULL)) {
885                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
886                         __func__, qid, sc->num_squeues);
887                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
888                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
889                 return (1);
890         }
891
892         sc->submit_queues[qid].qbase = NULL;
893         sc->submit_queues[qid].cqid = 0;
894         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
895         return (1);
896 }
897
898 static int
899 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
900         struct nvme_completion* compl)
901 {
902         if (command->cdw11 & NVME_CMD_CDW11_PC) {
903                 uint16_t qid = command->cdw10 & 0xffff;
904                 struct nvme_submission_queue *nsq;
905
906                 if ((qid == 0) || (qid > sc->num_squeues) ||
907                     (sc->submit_queues[qid].qbase != NULL)) {
908                         WPRINTF("%s queue index %u > num_squeues %u",
909                                 __func__, qid, sc->num_squeues);
910                         pci_nvme_status_tc(&compl->status,
911                             NVME_SCT_COMMAND_SPECIFIC,
912                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
913                         return (1);
914                 }
915
916                 nsq = &sc->submit_queues[qid];
917                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
918                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
919                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
920                         /*
921                          * Queues must specify at least two entries
922                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
923                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
924                          */
925                         pci_nvme_status_tc(&compl->status,
926                             NVME_SCT_COMMAND_SPECIFIC,
927                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
928                         return (1);
929                 }
930
931                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
932                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
933                         pci_nvme_status_tc(&compl->status,
934                             NVME_SCT_COMMAND_SPECIFIC,
935                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
936                         return (1);
937                 }
938
939                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
940                         pci_nvme_status_tc(&compl->status,
941                             NVME_SCT_COMMAND_SPECIFIC,
942                             NVME_SC_COMPLETION_QUEUE_INVALID);
943                         return (1);
944                 }
945
946                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
947
948                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
949                               sizeof(struct nvme_command) * (size_t)nsq->size);
950
951                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
952                         qid, nsq->size, nsq->qbase, nsq->cqid);
953
954                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
955
956                 DPRINTF("%s completed creating IOSQ qid %u",
957                          __func__, qid);
958         } else {
959                 /* 
960                  * Guest sent non-cont submission queue request.
961                  * This setting is unsupported by this emulation.
962                  */
963                 WPRINTF("%s unsupported non-contig (list-based) "
964                          "create i/o submission queue", __func__);
965
966                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
967         }
968         return (1);
969 }
970
971 static int
972 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
973         struct nvme_completion* compl)
974 {
975         uint16_t qid = command->cdw10 & 0xffff;
976         uint16_t sqid;
977
978         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
979         if (qid == 0 || qid > sc->num_cqueues ||
980             (sc->compl_queues[qid].qbase == NULL)) {
981                 WPRINTF("%s queue index %u / num_cqueues %u",
982                         __func__, qid, sc->num_cqueues);
983                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
984                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
985                 return (1);
986         }
987
988         /* Deleting an Active CQ is an error */
989         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
990                 if (sc->submit_queues[sqid].cqid == qid) {
991                         pci_nvme_status_tc(&compl->status,
992                             NVME_SCT_COMMAND_SPECIFIC,
993                             NVME_SC_INVALID_QUEUE_DELETION);
994                         return (1);
995                 }
996
997         sc->compl_queues[qid].qbase = NULL;
998         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
999         return (1);
1000 }
1001
1002 static int
1003 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1004         struct nvme_completion* compl)
1005 {
1006         struct nvme_completion_queue *ncq;
1007         uint16_t qid = command->cdw10 & 0xffff;
1008
1009         /* Only support Physically Contiguous queues */
1010         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1011                 WPRINTF("%s unsupported non-contig (list-based) "
1012                          "create i/o completion queue",
1013                          __func__);
1014
1015                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1016                 return (1);
1017         }
1018
1019         if ((qid == 0) || (qid > sc->num_cqueues) ||
1020             (sc->compl_queues[qid].qbase != NULL)) {
1021                 WPRINTF("%s queue index %u > num_cqueues %u",
1022                         __func__, qid, sc->num_cqueues);
1023                 pci_nvme_status_tc(&compl->status,
1024                     NVME_SCT_COMMAND_SPECIFIC,
1025                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1026                 return (1);
1027         }
1028
1029         ncq = &sc->compl_queues[qid];
1030         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1031         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1032         if (ncq->intr_vec > (sc->max_queues + 1)) {
1033                 pci_nvme_status_tc(&compl->status,
1034                     NVME_SCT_COMMAND_SPECIFIC,
1035                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1036                 return (1);
1037         }
1038
1039         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1040         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1041                 /*
1042                  * Queues must specify at least two entries
1043                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1044                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1045                  */
1046                 pci_nvme_status_tc(&compl->status,
1047                     NVME_SCT_COMMAND_SPECIFIC,
1048                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1049                 return (1);
1050         }
1051         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1052                      command->prp1,
1053                      sizeof(struct nvme_completion) * (size_t)ncq->size);
1054
1055         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1056
1057
1058         return (1);
1059 }
1060
1061 static int
1062 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1063         struct nvme_completion* compl)
1064 {
1065         uint32_t logsize;
1066         uint8_t logpage = command->cdw10 & 0xFF;
1067
1068         /*
1069          * Command specifies the number of dwords to return in fields NUMDU
1070          * and NUMDL. This is a zero-based value.
1071          */
1072         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1073         logsize *= sizeof(uint32_t);
1074
1075         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1076
1077         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1078
1079         switch (logpage) {
1080         case NVME_LOG_ERROR:
1081                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1082                     command->prp2, (uint8_t *)&sc->err_log,
1083                     MIN(logsize, sizeof(sc->err_log)),
1084                     NVME_COPY_TO_PRP);
1085                 break;
1086         case NVME_LOG_HEALTH_INFORMATION:
1087                 pthread_mutex_lock(&sc->mtx);
1088                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1089                     sizeof(sc->health_log.data_units_read));
1090                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1091                     sizeof(sc->health_log.data_units_written));
1092                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1093                     sizeof(sc->health_log.host_read_commands));
1094                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1095                     sizeof(sc->health_log.host_write_commands));
1096                 pthread_mutex_unlock(&sc->mtx);
1097
1098                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1099                     command->prp2, (uint8_t *)&sc->health_log,
1100                     MIN(logsize, sizeof(sc->health_log)),
1101                     NVME_COPY_TO_PRP);
1102                 break;
1103         case NVME_LOG_FIRMWARE_SLOT:
1104                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1105                     command->prp2, (uint8_t *)&sc->fw_log,
1106                     MIN(logsize, sizeof(sc->fw_log)),
1107                     NVME_COPY_TO_PRP);
1108                 break;
1109         default:
1110                 DPRINTF("%s get log page %x command not supported",
1111                         __func__, logpage);
1112
1113                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1114                     NVME_SC_INVALID_LOG_PAGE);
1115         }
1116
1117         return (1);
1118 }
1119
1120 static int
1121 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1122         struct nvme_completion* compl)
1123 {
1124         void *dest;
1125         uint16_t status;
1126
1127         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1128                 command->cdw10 & 0xFF, command->nsid);
1129
1130         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1131
1132         switch (command->cdw10 & 0xFF) {
1133         case 0x00: /* return Identify Namespace data structure */
1134                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1135                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1136                     NVME_COPY_TO_PRP);
1137                 break;
1138         case 0x01: /* return Identify Controller data structure */
1139                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1140                     command->prp2, (uint8_t *)&sc->ctrldata,
1141                     sizeof(sc->ctrldata),
1142                     NVME_COPY_TO_PRP);
1143                 break;
1144         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1145                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1146                                   sizeof(uint32_t) * 1024);
1147                 ((uint32_t *)dest)[0] = 1;
1148                 ((uint32_t *)dest)[1] = 0;
1149                 break;
1150         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1151                 if (command->nsid != 1) {
1152                         pci_nvme_status_genc(&status,
1153                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1154                         break;
1155                 }
1156                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1157                                   sizeof(uint32_t) * 1024);
1158                 /* All bytes after the descriptor shall be zero */
1159                 bzero(dest, sizeof(uint32_t) * 1024);
1160
1161                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1162                 ((uint8_t *)dest)[0] = 1;
1163                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1164                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1165                 break;
1166         default:
1167                 DPRINTF("%s unsupported identify command requested 0x%x",
1168                          __func__, command->cdw10 & 0xFF);
1169                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1170                 break;
1171         }
1172
1173         compl->status = status;
1174         return (1);
1175 }
1176
1177 static const char *
1178 nvme_fid_to_name(uint8_t fid)
1179 {
1180         const char *name;
1181
1182         switch (fid) {
1183         case NVME_FEAT_ARBITRATION:
1184                 name = "Arbitration";
1185                 break;
1186         case NVME_FEAT_POWER_MANAGEMENT:
1187                 name = "Power Management";
1188                 break;
1189         case NVME_FEAT_LBA_RANGE_TYPE:
1190                 name = "LBA Range Type";
1191                 break;
1192         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1193                 name = "Temperature Threshold";
1194                 break;
1195         case NVME_FEAT_ERROR_RECOVERY:
1196                 name = "Error Recovery";
1197                 break;
1198         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1199                 name = "Volatile Write Cache";
1200                 break;
1201         case NVME_FEAT_NUMBER_OF_QUEUES:
1202                 name = "Number of Queues";
1203                 break;
1204         case NVME_FEAT_INTERRUPT_COALESCING:
1205                 name = "Interrupt Coalescing";
1206                 break;
1207         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1208                 name = "Interrupt Vector Configuration";
1209                 break;
1210         case NVME_FEAT_WRITE_ATOMICITY:
1211                 name = "Write Atomicity Normal";
1212                 break;
1213         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1214                 name = "Asynchronous Event Configuration";
1215                 break;
1216         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1217                 name = "Autonomous Power State Transition";
1218                 break;
1219         case NVME_FEAT_HOST_MEMORY_BUFFER:
1220                 name = "Host Memory Buffer";
1221                 break;
1222         case NVME_FEAT_TIMESTAMP:
1223                 name = "Timestamp";
1224                 break;
1225         case NVME_FEAT_KEEP_ALIVE_TIMER:
1226                 name = "Keep Alive Timer";
1227                 break;
1228         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1229                 name = "Host Controlled Thermal Management";
1230                 break;
1231         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1232                 name = "Non-Operational Power State Config";
1233                 break;
1234         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1235                 name = "Read Recovery Level Config";
1236                 break;
1237         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1238                 name = "Predictable Latency Mode Config";
1239                 break;
1240         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1241                 name = "Predictable Latency Mode Window";
1242                 break;
1243         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1244                 name = "LBA Status Information Report Interval";
1245                 break;
1246         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1247                 name = "Host Behavior Support";
1248                 break;
1249         case NVME_FEAT_SANITIZE_CONFIG:
1250                 name = "Sanitize Config";
1251                 break;
1252         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1253                 name = "Endurance Group Event Configuration";
1254                 break;
1255         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1256                 name = "Software Progress Marker";
1257                 break;
1258         case NVME_FEAT_HOST_IDENTIFIER:
1259                 name = "Host Identifier";
1260                 break;
1261         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1262                 name = "Reservation Notification Mask";
1263                 break;
1264         case NVME_FEAT_RESERVATION_PERSISTENCE:
1265                 name = "Reservation Persistence";
1266                 break;
1267         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1268                 name = "Namespace Write Protection Config";
1269                 break;
1270         default:
1271                 name = "Unknown";
1272                 break;
1273         }
1274
1275         return (name);
1276 }
1277
1278 static void
1279 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1280     struct nvme_feature_obj *feat,
1281     struct nvme_command *command,
1282     struct nvme_completion *compl)
1283 {
1284
1285         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1286 }
1287
1288 static void
1289 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1290     struct nvme_feature_obj *feat,
1291     struct nvme_command *command,
1292     struct nvme_completion *compl)
1293 {
1294         uint32_t i;
1295         uint32_t cdw11 = command->cdw11;
1296         uint16_t iv;
1297         bool cd;
1298
1299         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1300
1301         iv = cdw11 & 0xffff;
1302         cd = cdw11 & (1 << 16);
1303
1304         if (iv > (sc->max_queues + 1)) {
1305                 return;
1306         }
1307
1308         /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1309         if ((iv == 0) && !cd)
1310                 return;
1311
1312         /* Requested Interrupt Vector must be used by a CQ */
1313         for (i = 0; i < sc->num_cqueues + 1; i++) {
1314                 if (sc->compl_queues[i].intr_vec == iv) {
1315                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1316                 }
1317         }
1318
1319 }
1320
1321 static void
1322 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1323     struct nvme_feature_obj *feat,
1324     struct nvme_command *command,
1325     struct nvme_completion *compl)
1326 {
1327         uint16_t nqr;   /* Number of Queues Requested */
1328
1329         if (sc->num_q_is_set) {
1330                 WPRINTF("%s: Number of Queues already set", __func__);
1331                 pci_nvme_status_genc(&compl->status,
1332                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1333                 return;
1334         }
1335
1336         nqr = command->cdw11 & 0xFFFF;
1337         if (nqr == 0xffff) {
1338                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1339                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1340                 return;
1341         }
1342
1343         sc->num_squeues = ONE_BASED(nqr);
1344         if (sc->num_squeues > sc->max_queues) {
1345                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1346                                         sc->max_queues);
1347                 sc->num_squeues = sc->max_queues;
1348         }
1349
1350         nqr = (command->cdw11 >> 16) & 0xFFFF;
1351         if (nqr == 0xffff) {
1352                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1353                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1354                 return;
1355         }
1356
1357         sc->num_cqueues = ONE_BASED(nqr);
1358         if (sc->num_cqueues > sc->max_queues) {
1359                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1360                                         sc->max_queues);
1361                 sc->num_cqueues = sc->max_queues;
1362         }
1363
1364         /* Patch the command value which will be saved on callback's return */
1365         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1366         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1367
1368         sc->num_q_is_set = true;
1369 }
1370
1371 static int
1372 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1373         struct nvme_completion *compl)
1374 {
1375         struct nvme_feature_obj *feat;
1376         uint32_t nsid = command->nsid;
1377         uint8_t fid = command->cdw10 & 0xFF;
1378
1379         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1380
1381         if (fid >= NVME_FID_MAX) {
1382                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1383                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1384                 return (1);
1385         }
1386         feat = &sc->feat[fid];
1387
1388         if (!feat->namespace_specific &&
1389             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1390                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1391                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1392                 return (1);
1393         }
1394
1395         compl->cdw0 = 0;
1396         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1397
1398         if (feat->set)
1399                 feat->set(sc, feat, command, compl);
1400
1401         if (compl->status == NVME_SC_SUCCESS)
1402                 feat->cdw11 = command->cdw11;
1403
1404         return (0);
1405 }
1406
1407 static int
1408 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1409         struct nvme_completion* compl)
1410 {
1411         struct nvme_feature_obj *feat;
1412         uint8_t fid = command->cdw10 & 0xFF;
1413
1414         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1415
1416         if (fid >= NVME_FID_MAX) {
1417                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1418                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1419                 return (1);
1420         }
1421
1422         compl->cdw0 = 0;
1423         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1424
1425         feat = &sc->feat[fid];
1426         if (feat->get) {
1427                 feat->get(sc, feat, command, compl);
1428         }
1429
1430         if (compl->status == NVME_SC_SUCCESS) {
1431                 compl->cdw0 = feat->cdw11;
1432         }
1433
1434         return (0);
1435 }
1436
1437 static int
1438 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1439         struct nvme_completion* compl)
1440 {
1441         uint8_t ses, lbaf, pi;
1442
1443         /* Only supports Secure Erase Setting - User Data Erase */
1444         ses = (command->cdw10 >> 9) & 0x7;
1445         if (ses > 0x1) {
1446                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1447                 return (1);
1448         }
1449
1450         /* Only supports a single LBA Format */
1451         lbaf = command->cdw10 & 0xf;
1452         if (lbaf != 0) {
1453                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1454                     NVME_SC_INVALID_FORMAT);
1455                 return (1);
1456         }
1457
1458         /* Doesn't support Protection Information */
1459         pi = (command->cdw10 >> 5) & 0x7;
1460         if (pi != 0) {
1461                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1462                 return (1);
1463         }
1464
1465         if (sc->nvstore.type == NVME_STOR_RAM) {
1466                 if (sc->nvstore.ctx)
1467                         free(sc->nvstore.ctx);
1468                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1469                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1470         } else {
1471                 struct pci_nvme_ioreq *req;
1472                 int err;
1473
1474                 req = pci_nvme_get_ioreq(sc);
1475                 if (req == NULL) {
1476                         pci_nvme_status_genc(&compl->status,
1477                             NVME_SC_INTERNAL_DEVICE_ERROR);
1478                         WPRINTF("%s: unable to allocate IO req", __func__);
1479                         return (1);
1480                 }
1481                 req->nvme_sq = &sc->submit_queues[0];
1482                 req->sqid = 0;
1483                 req->opc = command->opc;
1484                 req->cid = command->cid;
1485                 req->nsid = command->nsid;
1486
1487                 req->io_req.br_offset = 0;
1488                 req->io_req.br_resid = sc->nvstore.size;
1489                 req->io_req.br_callback = pci_nvme_io_done;
1490
1491                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1492                 if (err) {
1493                         pci_nvme_status_genc(&compl->status,
1494                             NVME_SC_INTERNAL_DEVICE_ERROR);
1495                         pci_nvme_release_ioreq(sc, req);
1496                 }
1497         }
1498
1499         return (1);
1500 }
1501
1502 static int
1503 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1504         struct nvme_completion* compl)
1505 {
1506         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1507                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1508
1509         /* TODO: search for the command ID and abort it */
1510
1511         compl->cdw0 = 1;
1512         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1513         return (1);
1514 }
1515
1516 static int
1517 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1518         struct nvme_command* command, struct nvme_completion* compl)
1519 {
1520         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1521
1522         /* Don't exceed the Async Event Request Limit (AERL). */
1523         if (pci_nvme_aer_limit_reached(sc)) {
1524                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1525                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1526                 return (1);
1527         }
1528
1529         if (pci_nvme_aer_add(sc, command->cid)) {
1530                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1531                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1532                 return (1);
1533         }
1534
1535         /*
1536          * Events are raised asynchronously, as they occur, subject to the
1537          * Async Event Configuration (Set Features). Do not post a completion
1538          * here; one is posted only when a matching event is reported.
1539          */
1540         compl->status = NVME_NO_STATUS;
1541
1542         return (0);
1543 }
1544
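/*
 * Process the Admin Submission Queue (queue 0). Commands are consumed from
 * the current head up to the guest-written tail; each synchronously handled
 * command posts an entry to Admin Completion Queue 0, while commands that
 * complete later (e.g. Async Event Request, Format NVM) leave the status as
 * NVME_NO_STATUS so no completion is posted here.
 */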
1545 static void
1546 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1547 {
1548         struct nvme_completion compl;
1549         struct nvme_command *cmd;
1550         struct nvme_submission_queue *sq;
1551         struct nvme_completion_queue *cq;
1552         uint16_t sqhead;
1553
1554         DPRINTF("%s index %u", __func__, (uint32_t)value);
1555
1556         sq = &sc->submit_queues[0];
1557         cq = &sc->compl_queues[0];
1558
1559         pthread_mutex_lock(&sq->mtx);
1560
1561         sqhead = sq->head;
1562         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1563         
1564         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1565                 cmd = &(sq->qbase)[sqhead];
1566                 compl.cdw0 = 0;
1567                 compl.status = 0;
1568
1569                 switch (cmd->opc) {
1570                 case NVME_OPC_DELETE_IO_SQ:
1571                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1572                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1573                         break;
1574                 case NVME_OPC_CREATE_IO_SQ:
1575                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1576                         nvme_opc_create_io_sq(sc, cmd, &compl);
1577                         break;
1578                 case NVME_OPC_DELETE_IO_CQ:
1579                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1580                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1581                         break;
1582                 case NVME_OPC_CREATE_IO_CQ:
1583                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1584                         nvme_opc_create_io_cq(sc, cmd, &compl);
1585                         break;
1586                 case NVME_OPC_GET_LOG_PAGE:
1587                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1588                         nvme_opc_get_log_page(sc, cmd, &compl);
1589                         break;
1590                 case NVME_OPC_IDENTIFY:
1591                         DPRINTF("%s command IDENTIFY", __func__);
1592                         nvme_opc_identify(sc, cmd, &compl);
1593                         break;
1594                 case NVME_OPC_ABORT:
1595                         DPRINTF("%s command ABORT", __func__);
1596                         nvme_opc_abort(sc, cmd, &compl);
1597                         break;
1598                 case NVME_OPC_SET_FEATURES:
1599                         DPRINTF("%s command SET_FEATURES", __func__);
1600                         nvme_opc_set_features(sc, cmd, &compl);
1601                         break;
1602                 case NVME_OPC_GET_FEATURES:
1603                         DPRINTF("%s command GET_FEATURES", __func__);
1604                         nvme_opc_get_features(sc, cmd, &compl);
1605                         break;
1606                 case NVME_OPC_FIRMWARE_ACTIVATE:
1607                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1608                         pci_nvme_status_tc(&compl.status,
1609                             NVME_SCT_COMMAND_SPECIFIC,
1610                             NVME_SC_INVALID_FIRMWARE_SLOT);
1611                         break;
1612                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1613                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1614                         nvme_opc_async_event_req(sc, cmd, &compl);
1615                         break;
1616                 case NVME_OPC_FORMAT_NVM:
1617                         DPRINTF("%s command FORMAT_NVM", __func__);
1618                         if ((sc->ctrldata.oacs &
1619                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1620                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                                 break;
1621                         }
1622                         compl.status = NVME_NO_STATUS;
1623                         nvme_opc_format_nvm(sc, cmd, &compl);
1624                         break;
1625                 default:
1626                         DPRINTF("0x%x command is not implemented",
1627                             cmd->opc);
1628                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1629                 }
1630                 sqhead = (sqhead + 1) % sq->size;
1631
1632                 if (NVME_COMPLETION_VALID(compl)) {
1633                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1634                             compl.cdw0,
1635                             cmd->cid,
1636                             0,          /* SQID */
1637                             compl.status);
1638                 }
1639         }
1640
1641         DPRINTF("setting sqhead %u", sqhead);
1642         sq->head = sqhead;
1643
1644         if (cq->head != cq->tail)
1645                 pci_generate_msix(sc->nsc_pi, 0);
1646
1647         pthread_mutex_unlock(&sq->mtx);
1648 }
1649
1650 /*
1651  * Update the Write and Read statistics reported in SMART data
1652  *
1653  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1654  * E.g. 1 data unit is 1 - 1,000 512 byte blocks; 3 data units are 2,001 - 3,000
1655  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1656  */
1657 static void
1658 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1659     size_t bytes, uint16_t status)
1660 {
1661
1662         pthread_mutex_lock(&sc->mtx);
1663         switch (opc) {
1664         case NVME_OPC_WRITE:
1665                 sc->write_commands++;
1666                 if (status != NVME_SC_SUCCESS)
1667                         break;
1668                 sc->write_dunits_remainder += (bytes / 512);
1669                 while (sc->write_dunits_remainder >= 1000) {
1670                         sc->write_data_units++;
1671                         sc->write_dunits_remainder -= 1000;
1672                 }
1673                 break;
1674         case NVME_OPC_READ:
1675                 sc->read_commands++;
1676                 if (status != NVME_SC_SUCCESS)
1677                         break;
1678                 sc->read_dunits_remainder += (bytes / 512);
1679                 while (sc->read_dunits_remainder >= 1000) {
1680                         sc->read_data_units++;
1681                         sc->read_dunits_remainder -= 1000;
1682                 }
1683                 break;
1684         default:
1685                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1686                 break;
1687         }
1688         pthread_mutex_unlock(&sc->mtx);
1689 }
1690
1691 /*
1692  * Check if the combination of Starting LBA (slba) and Number of Logical
1693  * Blocks (nlb) exceeds the range of the underlying storage.
1694  *
1695  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1696  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1697  * overflow.
1698  */
1699 static bool
1700 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1701     uint32_t nlb)
1702 {
1703         size_t  offset, bytes;
1704
1705         /* Overflow check of multiplying Starting LBA by the sector size */
1706         if (slba >> (64 - nvstore->sectsz_bits))
1707                 return (true);
1708
1709         offset = slba << nvstore->sectsz_bits;
1710         bytes = (size_t)nlb << nvstore->sectsz_bits;
1711
1712         /* Overflow check of Number of Logical Blocks */
1713         if ((nvstore->size - offset) < bytes)
1714                 return (true);
1715
1716         return (false);
1717 }
1718
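/*
 * Add a guest data buffer (PRP entry) to the request's iovec for blockif.
 * Entries that are physically contiguous with the previous one are merged
 * into a single iov to keep br_iovcnt (bounded by NVME_MAX_IOVEC) small.
 */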
1719 static int
1720 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1721         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1722 {
1723         int iovidx;
1724
1725         if (req == NULL)
1726                 return (-1);
1727
1728         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1729                 return (-1);
1730         }
1731
1732         /* concatenate contig block-iovs to minimize number of iovs */
1733         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1734                 iovidx = req->io_req.br_iovcnt - 1;
1735
1736                 req->io_req.br_iov[iovidx].iov_base =
1737                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1738                                      req->prev_gpaddr, size);
1739
1740                 req->prev_size += size;
1741                 req->io_req.br_resid += size;
1742
1743                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1744         } else {
1745                 iovidx = req->io_req.br_iovcnt;
1746                 if (iovidx == 0) {
1747                         req->io_req.br_offset = lba;
1748                         req->io_req.br_resid = 0;
1749                         req->io_req.br_param = req;
1750                 }
1751
1752                 req->io_req.br_iov[iovidx].iov_base =
1753                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1754                                      gpaddr, size);
1755
1756                 req->io_req.br_iov[iovidx].iov_len = size;
1757
1758                 req->prev_gpaddr = gpaddr;
1759                 req->prev_size = size;
1760                 req->io_req.br_resid += size;
1761
1762                 req->io_req.br_iovcnt++;
1763         }
1764
1765         return (0);
1766 }
1767
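/*
 * Post a completion for the given command to the CQ associated with the
 * submitting SQ and, if that CQ has interrupts enabled, raise its MSI-X
 * vector.
 */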
1768 static void
1769 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1770         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1771         uint32_t cdw0, uint16_t status)
1772 {
1773         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1774
1775         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1776                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1777                  NVME_STATUS_GET_SC(status));
1778
1779         pci_nvme_cq_update(sc, cq,
1780             cdw0,
1781             cid,
1782             sqid,
1783             status);
1784
1785         if (cq->head != cq->tail) {
1786                 if (cq->intr_en & NVME_CQ_INTEN) {
1787                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1788                 } else {
1789                         DPRINTF("%s: CQ%u interrupt disabled",
1790                                                 __func__, sq->cqid);
1791                 }
1792         }
1793 }
1794
1795 static void
1796 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1797 {
1798         req->sc = NULL;
1799         req->nvme_sq = NULL;
1800         req->sqid = 0;
1801
1802         pthread_mutex_lock(&sc->mtx);
1803
1804         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1805         sc->pending_ios--;
1806
1807         /* Once no IO is pending, mark the device ready if it is enabled but not yet ready */
1808         if (sc->pending_ios == 0 &&
1809             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1810                 sc->regs.csts |= NVME_CSTS_RDY;
1811
1812         pthread_mutex_unlock(&sc->mtx);
1813
1814         sem_post(&sc->iosemlock);
1815 }
1816
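/*
 * IO request slots are drawn from a free list whose depth is bounded by the
 * iosemlock semaphore (initialized to the ioslots option), so callers block
 * in sem_wait() until a slot is available; pci_nvme_release_ioreq() returns
 * the slot and posts the semaphore.
 */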
1817 static struct pci_nvme_ioreq *
1818 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1819 {
1820         struct pci_nvme_ioreq *req = NULL;
1821
1822         sem_wait(&sc->iosemlock);
1823         pthread_mutex_lock(&sc->mtx);
1824
1825         req = STAILQ_FIRST(&sc->ioreqs_free);
1826         assert(req != NULL);
1827         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1828
1829         req->sc = sc;
1830
1831         sc->pending_ios++;
1832
1833         pthread_mutex_unlock(&sc->mtx);
1834
1835         req->io_req.br_iovcnt = 0;
1836         req->io_req.br_offset = 0;
1837         req->io_req.br_resid = 0;
1838         req->io_req.br_param = req;
1839         req->prev_gpaddr = 0;
1840         req->prev_size = 0;
1841
1842         return req;
1843 }
1844
1845 static void
1846 pci_nvme_io_done(struct blockif_req *br, int err)
1847 {
1848         struct pci_nvme_ioreq *req = br->br_param;
1849         struct nvme_submission_queue *sq = req->nvme_sq;
1850         uint16_t code, status;
1851
1852         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1853
1854         /* TODO return correct error */
1855         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1856         pci_nvme_status_genc(&status, code);
1857
1858         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1859         pci_nvme_stats_write_read_update(req->sc, req->opc,
1860             req->bytes, status);
1861         pci_nvme_release_ioreq(req->sc, req);
1862 }
1863
1864 /*
1865  * Implements the Flush command. The specification states:
1866  *    If a volatile write cache is not present, Flush commands complete
1867  *    successfully and have no effect
1868  * in the description of the Volatile Write Cache (VWC) field of the Identify
1869  * Controller data. Therefore, set status to Success if the command is
1870  * not supported (i.e. RAM or as indicated by the blockif).
1871  */
1872 static bool
1873 nvme_opc_flush(struct pci_nvme_softc *sc,
1874     struct nvme_command *cmd,
1875     struct pci_nvme_blockstore *nvstore,
1876     struct pci_nvme_ioreq *req,
1877     uint16_t *status)
1878 {
1879         bool pending = false;
1880
1881         if (nvstore->type == NVME_STOR_RAM) {
1882                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1883         } else {
1884                 int err;
1885
1886                 req->io_req.br_callback = pci_nvme_io_done;
1887
1888                 err = blockif_flush(nvstore->ctx, &req->io_req);
1889                 switch (err) {
1890                 case 0:
1891                         pending = true;
1892                         break;
1893                 case EOPNOTSUPP:
1894                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1895                         break;
1896                 default:
1897                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1898                 }
1899         }
1900
1901         return (pending);
1902 }
1903
1904 static uint16_t
1905 nvme_write_read_ram(struct pci_nvme_softc *sc,
1906     struct pci_nvme_blockstore *nvstore,
1907     uint64_t prp1, uint64_t prp2,
1908     size_t offset, uint64_t bytes,
1909     bool is_write)
1910 {
1911         uint8_t *buf = nvstore->ctx;
1912         enum nvme_copy_dir dir;
1913         uint16_t status;
1914
1915         if (is_write)
1916                 dir = NVME_COPY_TO_PRP;
1917         else
1918                 dir = NVME_COPY_FROM_PRP;
1919
1920         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1921             buf + offset, bytes, dir))
1922                 pci_nvme_status_genc(&status,
1923                     NVME_SC_DATA_TRANSFER_ERROR);
1924         else
1925                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1926
1927         return (status);
1928 }
1929
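/*
 * Build the blockif request from the command's PRPs. PRP1 maps the data up
 * to the first page boundary. If more data remains and it fits within one
 * page, PRP2 maps it directly; otherwise PRP2 is the guest physical address
 * of a PRP list, whose last entry chains to the next list page.
 */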
1930 static uint16_t
1931 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1932     struct pci_nvme_blockstore *nvstore,
1933     struct pci_nvme_ioreq *req,
1934     uint64_t prp1, uint64_t prp2,
1935     size_t offset, uint64_t bytes,
1936     bool is_write)
1937 {
1938         uint64_t size;
1939         int err;
1940         uint16_t status = NVME_NO_STATUS;
1941
1942         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1943         if (pci_nvme_append_iov_req(sc, req, prp1,
1944             size, is_write, offset)) {
1945                 pci_nvme_status_genc(&status,
1946                     NVME_SC_DATA_TRANSFER_ERROR);
1947                 goto out;
1948         }
1949
1950         offset += size;
1951         bytes  -= size;
1952
1953         if (bytes == 0) {
1954                 ;
1955         } else if (bytes <= PAGE_SIZE) {
1956                 size = bytes;
1957                 if (pci_nvme_append_iov_req(sc, req, prp2,
1958                     size, is_write, offset)) {
1959                         pci_nvme_status_genc(&status,
1960                             NVME_SC_DATA_TRANSFER_ERROR);
1961                         goto out;
1962                 }
1963         } else {
1964                 void *vmctx = sc->nsc_pi->pi_vmctx;
1965                 uint64_t *prp_list = &prp2;
1966                 uint64_t *last = prp_list;
1967
1968                 /* PRP2 is pointer to a physical region page list */
1969                 while (bytes) {
1970                         /* Last entry in list points to the next list */
1971                         if (prp_list == last) {
1972                                 uint64_t prp = *prp_list;
1973
1974                                 prp_list = paddr_guest2host(vmctx, prp,
1975                                     PAGE_SIZE - (prp % PAGE_SIZE));
1976                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
1977                         }
1978
1979                         size = MIN(bytes, PAGE_SIZE);
1980
1981                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
1982                             size, is_write, offset)) {
1983                                 pci_nvme_status_genc(&status,
1984                                     NVME_SC_DATA_TRANSFER_ERROR);
1985                                 goto out;
1986                         }
1987
1988                         offset += size;
1989                         bytes  -= size;
1990
1991                         prp_list++;
1992                 }
1993         }
1994         req->io_req.br_callback = pci_nvme_io_done;
1995         if (is_write)
1996                 err = blockif_write(nvstore->ctx, &req->io_req);
1997         else
1998                 err = blockif_read(nvstore->ctx, &req->io_req);
1999
2000         if (err)
2001                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2002 out:
2003         return (status);
2004 }
2005
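/*
 * Handle the Read and Write commands: decode the Starting LBA (CDW11:CDW10)
 * and the zero-based Number of Logical Blocks (CDW12 bits 15:0), reject
 * requests outside the namespace or larger than NVME_MAX_DATA_SIZE, then
 * service the transfer from the RAM backing store or via blockif. Returns
 * true if the request was queued and will be completed asynchronously.
 */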
2006 static bool
2007 nvme_opc_write_read(struct pci_nvme_softc *sc,
2008     struct nvme_command *cmd,
2009     struct pci_nvme_blockstore *nvstore,
2010     struct pci_nvme_ioreq *req,
2011     uint16_t *status)
2012 {
2013         uint64_t lba, nblocks, bytes;
2014         size_t offset;
2015         bool is_write = cmd->opc == NVME_OPC_WRITE;
2016         bool pending = false;
2017
2018         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2019         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2020         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2021                 WPRINTF("%s command would exceed LBA range", __func__);
2022                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2023                 goto out;
2024         }
2025
2026         bytes  = nblocks << nvstore->sectsz_bits;
2027         if (bytes > NVME_MAX_DATA_SIZE) {
2028                 WPRINTF("%s command would exceed MDTS", __func__);
2029                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2030                 goto out;
2031         }
2032
2033         offset = lba << nvstore->sectsz_bits;
2034
2035         req->bytes = bytes;
2036         req->io_req.br_offset = lba;
2037
2038         /* PRP bits 1:0 must be zero */
2039         cmd->prp1 &= ~0x3UL;
2040         cmd->prp2 &= ~0x3UL;
2041
2042         if (nvstore->type == NVME_STOR_RAM) {
2043                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2044                     cmd->prp2, offset, bytes, is_write);
2045         } else {
2046                 *status = nvme_write_read_blockif(sc, nvstore, req,
2047                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2048
2049                 if (*status == NVME_NO_STATUS)
2050                         pending = true;
2051         }
2052 out:
2053         if (!pending)
2054                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2055
2056         return (pending);
2057 }
2058
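/*
 * Completion callback used when a Dataset Management command carries more
 * than one range: the ranges were stashed in br_iov (iov_base = byte offset,
 * iov_len = byte count), req->prev_size holds the range count, and
 * req->prev_gpaddr indexes the range currently being deleted. Each callback
 * either issues the next blockif_delete() or posts the final completion.
 */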
2059 static void
2060 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2061 {
2062         struct pci_nvme_ioreq *req = br->br_param;
2063         struct pci_nvme_softc *sc = req->sc;
2064         bool done = true;
2065         uint16_t status;
2066
2067         if (err) {
2068                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2069         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2070                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2071         } else {
2072                 struct iovec *iov = req->io_req.br_iov;
2073
2074                 req->prev_gpaddr++;
2075                 iov += req->prev_gpaddr;
2076
2077                 /* The iov_* values already include the sector size */
2078                 req->io_req.br_offset = (off_t)iov->iov_base;
2079                 req->io_req.br_resid = iov->iov_len;
2080                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2081                         pci_nvme_status_genc(&status,
2082                             NVME_SC_INTERNAL_DEVICE_ERROR);
2083                 } else
2084                         done = false;
2085         }
2086
2087         if (done) {
2088                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2089                     req->cid, 0, status);
2090                 pci_nvme_release_ioreq(sc, req);
2091         }
2092 }
2093
2094 static bool
2095 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2096     struct nvme_command *cmd,
2097     struct pci_nvme_blockstore *nvstore,
2098     struct pci_nvme_ioreq *req,
2099     uint16_t *status)
2100 {
2101         struct nvme_dsm_range *range;
2102         uint32_t nr, r, non_zero, dr;
2103         int err;
2104         bool pending = false;
2105
2106         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2107                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2108                 goto out;
2109         }
2110
2111         nr = cmd->cdw10 & 0xff;
2112
2113         /* copy locally because a range entry could straddle PRPs */
2114         range = calloc(1, NVME_MAX_DSM_TRIM);
2115         if (range == NULL) {
2116                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2117                 goto out;
2118         }
2119         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2120             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2121
2122         /* Check for invalid ranges and the number of non-zero lengths */
2123         non_zero = 0;
2124         for (r = 0; r <= nr; r++) {
2125                 if (pci_nvme_out_of_range(nvstore,
2126                     range[r].starting_lba, range[r].length)) {
2127                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2128                         goto out;
2129                 }
2130                 if (range[r].length != 0)
2131                         non_zero++;
2132         }
2133
2134         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2135                 size_t offset, bytes;
2136                 int sectsz_bits = sc->nvstore.sectsz_bits;
2137
2138                 /*
2139                  * DSM calls are advisory only, and compliant controllers
2140                  * may choose to take no action (i.e. return Success).
2141                  */
2142                 if (!nvstore->deallocate) {
2143                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2144                         goto out;
2145                 }
2146
2147                 /* If all ranges have a zero length, return Success */
2148                 if (non_zero == 0) {
2149                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2150                         goto out;
2151                 }
2152
2153                 if (req == NULL) {
2154                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2155                         goto out;
2156                 }
2157
2158                 offset = range[0].starting_lba << sectsz_bits;
2159                 bytes = range[0].length << sectsz_bits;
2160
2161                 /*
2162                  * If the request is for more than a single range, store
2163                  * the ranges in the br_iov. Optimize for the common case
2164                  * of a single range.
2165                  *
2166                  * Note that NVMe Number of Ranges is a zero based value
2167                  */
2168                 req->io_req.br_iovcnt = 0;
2169                 req->io_req.br_offset = offset;
2170                 req->io_req.br_resid = bytes;
2171
2172                 if (nr == 0) {
2173                         req->io_req.br_callback = pci_nvme_io_done;
2174                 } else {
2175                         struct iovec *iov = req->io_req.br_iov;
2176
2177                         for (r = 0, dr = 0; r <= nr; r++) {
2178                                 offset = range[r].starting_lba << sectsz_bits;
2179                                 bytes = range[r].length << sectsz_bits;
2180                                 if (bytes == 0)
2181                                         continue;
2182
2183                                 if ((nvstore->size - offset) < bytes) {
2184                                         pci_nvme_status_genc(status,
2185                                             NVME_SC_LBA_OUT_OF_RANGE);
2186                                         goto out;
2187                                 }
2188                                 iov[dr].iov_base = (void *)offset;
2189                                 iov[dr].iov_len = bytes;
2190                                 dr++;
2191                         }
2192                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2193
2194                         /*
2195                          * Use prev_gpaddr to track the current entry and
2196                          * prev_size to track the number of entries
2197                          */
2198                         req->prev_gpaddr = 0;
2199                         req->prev_size = dr;
2200                 }
2201
2202                 err = blockif_delete(nvstore->ctx, &req->io_req);
2203                 if (err)
2204                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2205                 else
2206                         pending = true;
2207         }
2208 out:
2209         free(range);
2210         return (pending);
2211 }
2212
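/*
 * Process an IO Submission Queue: consume commands from head to the
 * guest-written tail, allocate an ioreq per command, and dispatch by opcode.
 * Handlers that queue blockif work return pending=true and the completion is
 * posted from the blockif callback; everything else is completed (and the
 * ioreq released) inline.
 */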
2213 static void
2214 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2215 {
2216         struct nvme_submission_queue *sq;
2217         uint16_t status;
2218         uint16_t sqhead;
2219
2220         /* handle all submissions up to sq->tail index */
2221         sq = &sc->submit_queues[idx];
2222
2223         pthread_mutex_lock(&sq->mtx);
2224
2225         sqhead = sq->head;
2226         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2227                  idx, sqhead, sq->tail, sq->qbase);
2228
2229         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2230                 struct nvme_command *cmd;
2231                 struct pci_nvme_ioreq *req;
2232                 uint32_t nsid;
2233                 bool pending;
2234
2235                 pending = false;
2236                 req = NULL;
2237                 status = 0;
2238
2239                 cmd = &sq->qbase[sqhead];
2240                 sqhead = (sqhead + 1) % sq->size;
2241
2242                 nsid = le32toh(cmd->nsid);
2243                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2244                         pci_nvme_status_genc(&status,
2245                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2246                         status |=
2247                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2248                         goto complete;
2249                 }
2250
2251                 req = pci_nvme_get_ioreq(sc);
2252                 if (req == NULL) {
2253                         pci_nvme_status_genc(&status,
2254                             NVME_SC_INTERNAL_DEVICE_ERROR);
2255                         WPRINTF("%s: unable to allocate IO req", __func__);
2256                         goto complete;
2257                 }
2258                 req->nvme_sq = sq;
2259                 req->sqid = idx;
2260                 req->opc = cmd->opc;
2261                 req->cid = cmd->cid;
2262                 req->nsid = cmd->nsid;
2263
2264                 switch (cmd->opc) {
2265                 case NVME_OPC_FLUSH:
2266                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2267                             req, &status);
2268                         break;
2269                 case NVME_OPC_WRITE:
2270                 case NVME_OPC_READ:
2271                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2272                             req, &status);
2273                         break;
2274                 case NVME_OPC_WRITE_ZEROES:
2275                         /* TODO: write zeroes
2276                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2277                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2278                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2279                         break;
2280                 case NVME_OPC_DATASET_MANAGEMENT:
2281                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2282                             req, &status);
2283                         break;
2284                 default:
2285                         WPRINTF("%s unhandled io command 0x%x",
2286                             __func__, cmd->opc);
2287                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2288                 }
2289 complete:
2290                 if (!pending) {
2291                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2292                             status);
2293                         if (req != NULL)
2294                                 pci_nvme_release_ioreq(sc, req);
2295                 }
2296         }
2297
2298         sq->head = sqhead;
2299
2300         pthread_mutex_unlock(&sq->mtx);
2301 }
2302
2303 static void
2304 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2305         uint64_t idx, int is_sq, uint64_t value)
2306 {
2307         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2308                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2309
2310         if (is_sq) {
2311                 if (idx > sc->num_squeues) {
2312                         WPRINTF("%s queue index %lu overflow from "
2313                                  "guest (max %u)",
2314                                  __func__, idx, sc->num_squeues);
2315                         return;
2316                 }
2317
2318                 atomic_store_short(&sc->submit_queues[idx].tail,
2319                                    (uint16_t)value);
2320
2321                 if (idx == 0) {
2322                         pci_nvme_handle_admin_cmd(sc, value);
2323                 } else {
2324                         /* submission queue; handle new entries in SQ */
2325                         if (idx > sc->num_squeues) {
2326                                 WPRINTF("%s SQ index %lu overflow from "
2327                                          "guest (max %u)",
2328                                          __func__, idx, sc->num_squeues);
2329                                 return;
2330                         }
2331                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2332                 }
2333         } else {
2334                 if (idx > sc->num_cqueues) {
2335                         WPRINTF("%s queue index %lu overflow from "
2336                                  "guest (max %u)",
2337                                  __func__, idx, sc->num_cqueues);
2338                         return;
2339                 }
2340
2341                 atomic_store_short(&sc->compl_queues[idx].head,
2342                                 (uint16_t)value);
2343         }
2344 }
2345
2346 static void
2347 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2348 {
2349         const char *s = iswrite ? "WRITE" : "READ";
2350
2351         switch (offset) {
2352         case NVME_CR_CAP_LOW:
2353                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2354                 break;
2355         case NVME_CR_CAP_HI:
2356                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2357                 break;
2358         case NVME_CR_VS:
2359                 DPRINTF("%s %s NVME_CR_VS", func, s);
2360                 break;
2361         case NVME_CR_INTMS:
2362                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2363                 break;
2364         case NVME_CR_INTMC:
2365                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2366                 break;
2367         case NVME_CR_CC:
2368                 DPRINTF("%s %s NVME_CR_CC", func, s);
2369                 break;
2370         case NVME_CR_CSTS:
2371                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2372                 break;
2373         case NVME_CR_NSSR:
2374                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2375                 break;
2376         case NVME_CR_AQA:
2377                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2378                 break;
2379         case NVME_CR_ASQ_LOW:
2380                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2381                 break;
2382         case NVME_CR_ASQ_HI:
2383                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2384                 break;
2385         case NVME_CR_ACQ_LOW:
2386                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2387                 break;
2388         case NVME_CR_ACQ_HI:
2389                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2390                 break;
2391         default:
2392                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2393         }
2394
2395 }
2396
2397 static void
2398 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2399         uint64_t offset, int size, uint64_t value)
2400 {
2401         uint32_t ccreg;
2402
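        /*
         * Doorbell registers start at NVME_DOORBELL_OFFSET and each queue
         * pair occupies 8 bytes: the SQ Tail Doorbell followed by the CQ
         * Head Doorbell, 4 bytes each (the minimum doorbell stride). Hence
         * the queue index is belloffset / 8 and the low 3 bits select SQ
         * versus CQ.
         */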
2403         if (offset >= NVME_DOORBELL_OFFSET) {
2404                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2405                 uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2406                 int is_sq = (belloffset % 8) < 4;
2407
2408                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2409                         WPRINTF("guest attempted an overflow write offset "
2410                                  "0x%lx, val 0x%lx in %s",
2411                                  offset, value, __func__);
2412                         return;
2413                 }
2414
2415                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2416                 return;
2417         }
2418
2419         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2420                 offset, size, value);
2421
2422         if (size != 4) {
2423                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2424                          "val 0x%lx) to bar0 in %s",
2425                          size, offset, value, __func__);
2426                 /* TODO: shutdown device */
2427                 return;
2428         }
2429
2430         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2431
2432         pthread_mutex_lock(&sc->mtx);
2433
2434         switch (offset) {
2435         case NVME_CR_CAP_LOW:
2436         case NVME_CR_CAP_HI:
2437                 /* readonly */
2438                 break;
2439         case NVME_CR_VS:
2440                 /* readonly */
2441                 break;
2442         case NVME_CR_INTMS:
2443                 /* MSI-X, so ignore */
2444                 break;
2445         case NVME_CR_INTMC:
2446                 /* MSI-X, so ignore */
2447                 break;
2448         case NVME_CR_CC:
2449                 ccreg = (uint32_t)value;
2450
2451                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2452                          "iocqes %u",
2453                         __func__,
2454                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2455                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2456                          NVME_CC_GET_IOCQES(ccreg));
2457
2458                 if (NVME_CC_GET_SHN(ccreg)) {
2459                         /* perform shutdown - flush out data to backend */
2460                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2461                             NVME_CSTS_REG_SHST_SHIFT);
2462                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2463                             NVME_CSTS_REG_SHST_SHIFT;
2464                 }
2465                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2466                         if (NVME_CC_GET_EN(ccreg) == 0)
2467                                 /* transition 1->0 causes controller reset */
2468                                 pci_nvme_reset_locked(sc);
2469                         else
2470                                 pci_nvme_init_controller(ctx, sc);
2471                 }
2472
2473                 /* Insert the iocqes, iosqes and en bits from the write */
2474                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2475                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2476                 if (NVME_CC_GET_EN(ccreg) == 0) {
2477                         /* Insert the ams, mps and css bit fields */
2478                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2479                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2480                         sc->regs.csts &= ~NVME_CSTS_RDY;
2481                 } else if (sc->pending_ios == 0) {
2482                         sc->regs.csts |= NVME_CSTS_RDY;
2483                 }
2484                 break;
2485         case NVME_CR_CSTS:
2486                 break;
2487         case NVME_CR_NSSR:
2488                 /* ignore writes; don't support subsystem reset */
2489                 break;
2490         case NVME_CR_AQA:
2491                 sc->regs.aqa = (uint32_t)value;
2492                 break;
2493         case NVME_CR_ASQ_LOW:
2494                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2495                                (0xFFFFF000 & value);
2496                 break;
2497         case NVME_CR_ASQ_HI:
2498                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2499                                (value << 32);
2500                 break;
2501         case NVME_CR_ACQ_LOW:
2502                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2503                                (0xFFFFF000 & value);
2504                 break;
2505         case NVME_CR_ACQ_HI:
2506                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2507                                (value << 32);
2508                 break;
2509         default:
2510                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2511                          __func__, offset, value, size);
2512         }
2513         pthread_mutex_unlock(&sc->mtx);
2514 }
2515
2516 static void
2517 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2518                 int baridx, uint64_t offset, int size, uint64_t value)
2519 {
2520         struct pci_nvme_softc* sc = pi->pi_arg;
2521
2522         if (baridx == pci_msix_table_bar(pi) ||
2523             baridx == pci_msix_pba_bar(pi)) {
2524                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2525                          " value 0x%lx", baridx, offset, size, value);
2526
2527                 pci_emul_msix_twrite(pi, offset, size, value);
2528                 return;
2529         }
2530
2531         switch (baridx) {
2532         case 0:
2533                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2534                 break;
2535
2536         default:
2537                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2538                          __func__, baridx, value);
2539         }
2540 }
2541
2542 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2543         uint64_t offset, int size)
2544 {
2545         uint64_t value;
2546
2547         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2548
2549         if (offset < NVME_DOORBELL_OFFSET) {
2550                 void *p = &(sc->regs);
2551                 pthread_mutex_lock(&sc->mtx);
2552                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2553                 pthread_mutex_unlock(&sc->mtx);
2554         } else {
2555                 value = 0;
2556                 WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
2557         }
2558
2559         switch (size) {
2560         case 1:
2561                 value &= 0xFF;
2562                 break;
2563         case 2:
2564                 value &= 0xFFFF;
2565                 break;
2566         case 4:
2567                 value &= 0xFFFFFFFF;
2568                 break;
2569         }
2570
2571         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2572                  offset, size, (uint32_t)value);
2573
2574         return (value);
2575 }
2576
2577
2578
2579 static uint64_t
2580 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2581     uint64_t offset, int size)
2582 {
2583         struct pci_nvme_softc* sc = pi->pi_arg;
2584
2585         if (baridx == pci_msix_table_bar(pi) ||
2586             baridx == pci_msix_pba_bar(pi)) {
2587                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2588                         baridx, offset, size);
2589
2590                 return pci_emul_msix_tread(pi, offset, size);
2591         }
2592
2593         switch (baridx) {
2594         case 0:
2595                 return pci_nvme_read_bar_0(sc, offset, size);
2596
2597         default:
2598                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2599         }
2600
2601         return (0);
2602 }
2603
2604
2605 static int
2606 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2607 {
2608         char bident[sizeof("XX:X:X")];
2609         char    *uopt, *xopts, *config;
2610         uint32_t sectsz;
2611         int optidx;
2612
2613         sc->max_queues = NVME_QUEUES;
2614         sc->max_qentries = NVME_MAX_QENTRIES;
2615         sc->ioslots = NVME_IOSLOTS;
2616         sc->num_squeues = sc->max_queues;
2617         sc->num_cqueues = sc->max_queues;
2618         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2619         sectsz = 0;
2620
2621         uopt = strdup(opts);
2622         optidx = 0;
2623         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2624                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2625         for (xopts = strtok(uopt, ",");
2626              xopts != NULL;
2627              xopts = strtok(NULL, ",")) {
2628
2629                 if ((config = strchr(xopts, '=')) != NULL)
2630                         *config++ = '\0';
2631
2632                 if (!strcmp("maxq", xopts)) {
2633                         sc->max_queues = atoi(config);
2634                 } else if (!strcmp("qsz", xopts)) {
2635                         sc->max_qentries = atoi(config);
2636                 } else if (!strcmp("ioslots", xopts)) {
2637                         sc->ioslots = atoi(config);
2638                 } else if (!strcmp("sectsz", xopts)) {
2639                         sectsz = atoi(config);
2640                 } else if (!strcmp("ser", xopts)) {
2641                         /*
2642                          * This field indicates the Product Serial Number in
2643                          * 7-bit ASCII, unused bytes should be space characters.
2644                          * Ref: NVMe v1.3c.
2645                          */
2646                         cpywithpad((char *)sc->ctrldata.sn,
2647                                    sizeof(sc->ctrldata.sn), config, ' ');
2648                 } else if (!strcmp("ram", xopts)) {
2649                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
2650
2651                         sc->nvstore.type = NVME_STOR_RAM;
2652                         sc->nvstore.size = sz * 1024 * 1024;
2653                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2654                         sc->nvstore.sectsz = 4096;
2655                         sc->nvstore.sectsz_bits = 12;
2656                         if (sc->nvstore.ctx == NULL) {
2657                                 perror("Unable to allocate RAM");
2658                                 free(uopt);
2659                                 return (-1);
2660                         }
2661                 } else if (!strcmp("eui64", xopts)) {
2662                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2663                 } else if (!strcmp("dsm", xopts)) {
2664                         if (!strcmp("auto", config))
2665                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2666                         else if (!strcmp("enable", config))
2667                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2668                         else if (!strcmp("disable", config))
2669                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2670                 } else if (optidx == 0) {
2671                         snprintf(bident, sizeof(bident), "%d:%d",
2672                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2673                         sc->nvstore.ctx = blockif_open(xopts, bident);
2674                         if (sc->nvstore.ctx == NULL) {
2675                                 perror("Could not open backing file");
2676                                 free(uopt);
2677                                 return (-1);
2678                         }
2679                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2680                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2681                 } else {
2682                         EPRINTLN("Invalid option %s", xopts);
2683                         free(uopt);
2684                         return (-1);
2685                 }
2686
2687                 optidx++;
2688         }
2689         free(uopt);
2690
2691         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2692                 EPRINTLN("backing store not specified");
2693                 return (-1);
2694         }
2695         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2696                 sc->nvstore.sectsz = sectsz;
2697         else if (sc->nvstore.type != NVME_STOR_RAM)
2698                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2699         for (sc->nvstore.sectsz_bits = 9;
2700              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2701              sc->nvstore.sectsz_bits++);
2702
2703         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2704                 sc->max_queues = NVME_QUEUES;
2705
2706         if (sc->max_qentries <= 0) {
2707                 EPRINTLN("Invalid qsz option");
2708                 return (-1);
2709         }
2710         if (sc->ioslots <= 0) {
2711                 EPRINTLN("Invalid ioslots option");
2712                 return (-1);
2713         }
2714
2715         return (0);
2716 }
2717
2718 static int
2719 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2720 {
2721         struct pci_nvme_softc *sc;
2722         uint32_t pci_membar_sz;
2723         int     error;
2724
2725         error = 0;
2726
2727         sc = calloc(1, sizeof(struct pci_nvme_softc));
2728         pi->pi_arg = sc;
2729         sc->nsc_pi = pi;
2730
2731         error = pci_nvme_parse_opts(sc, opts);
2732         if (error < 0)
2733                 goto done;
2734         else
2735                 error = 0;
2736
2737         STAILQ_INIT(&sc->ioreqs_free);
2738         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2739         for (int i = 0; i < sc->ioslots; i++) {
2740                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2741         }
2742
2743         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2744         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2745         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2746         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2747         pci_set_cfgdata8(pi, PCIR_PROGIF,
2748                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2749
2750         /*
2751          * Allocate size of NVMe registers + doorbell space for all queues.
2752          *
2753          * The specification requires a minimum memory I/O window size of 16K.
2754          * The Windows driver will refuse to start a device with a smaller
2755          * window.
2756          */
2757         pci_membar_sz = sizeof(struct nvme_registers) +
2758             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2759         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2760
2761         DPRINTF("nvme membar size: %u", pci_membar_sz);
2762
2763         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2764         if (error) {
2765                 WPRINTF("%s pci alloc mem bar failed", __func__);
2766                 goto done;
2767         }
2768
2769         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2770         if (error) {
2771                 WPRINTF("%s pci add msixcap failed", __func__);
2772                 goto done;
2773         }
2774
2775         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2776         if (error) {
2777                 WPRINTF("%s pci add Express capability failed", __func__);
2778                 goto done;
2779         }
2780
2781         pthread_mutex_init(&sc->mtx, NULL);
2782         sem_init(&sc->iosemlock, 0, sc->ioslots);
2783
2784         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2785         /*
2786          * Controller data depends on Namespace data so initialize Namespace
2787          * data first.
2788          */
2789         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2790         pci_nvme_init_ctrldata(sc);
2791         pci_nvme_init_logpages(sc);
2792         pci_nvme_init_features(sc);
2793
2794         pci_nvme_aer_init(sc);
2795
2796         pci_nvme_reset(sc);
2797
2798         pci_lintr_request(pi);
2799
2800 done:
2801         return (error);
2802 }
2803
2804
2805 struct pci_devemu pci_de_nvme = {
2806         .pe_emu =       "nvme",
2807         .pe_init =      pci_nvme_init,
2808         .pe_barwrite =  pci_nvme_write,
2809         .pe_barread =   pci_nvme_read
2810 };
2811 PCI_EMUL_SET(pci_de_nvme);