bhyve: validate the NVMe LBA start and count
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
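/*
 * Illustrative invocations (the slot number and backing paths below are
 * examples for this sketch, not values taken from this file):
 *
 *   bhyve ... -s 4,nvme,/var/tmp/nvme.img,maxq=4,qsz=512,ioslots=16,ser=NVME0001 ...
 *   bhyve ... -s 4,nvme,ram=512 ...     (512 MiB RAM-backed namespace)
 */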
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN             0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
106
107 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
108 #define NVME_MDTS               9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
112
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS          0xffff
115 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
116
117 /* helpers */
118
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)         ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)         ((one)  - 1)
123
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127          ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
128
129 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
130
131 enum nvme_controller_register_offsets {
132         NVME_CR_CAP_LOW = 0x00,
133         NVME_CR_CAP_HI  = 0x04,
134         NVME_CR_VS      = 0x08,
135         NVME_CR_INTMS   = 0x0c,
136         NVME_CR_INTMC   = 0x10,
137         NVME_CR_CC      = 0x14,
138         NVME_CR_CSTS    = 0x1c,
139         NVME_CR_NSSR    = 0x20,
140         NVME_CR_AQA     = 0x24,
141         NVME_CR_ASQ_LOW = 0x28,
142         NVME_CR_ASQ_HI  = 0x2c,
143         NVME_CR_ACQ_LOW = 0x30,
144         NVME_CR_ACQ_HI  = 0x34,
145 };
146
147 enum nvme_cmd_cdw11 {
148         NVME_CMD_CDW11_PC  = 0x0001,
149         NVME_CMD_CDW11_IEN = 0x0002,
150         NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152
153 enum nvme_copy_dir {
154         NVME_COPY_TO_PRP,
155         NVME_COPY_FROM_PRP,
156 };
157
158 #define NVME_CQ_INTEN   0x01
159 #define NVME_CQ_INTCOAL 0x02
160
161 struct nvme_completion_queue {
162         struct nvme_completion *qbase;
163         pthread_mutex_t mtx;
164         uint32_t        size;
165         uint16_t        tail; /* nvme progress */
166         uint16_t        head; /* guest progress */
167         uint16_t        intr_vec;
168         uint32_t        intr_en;
169 };
170
171 struct nvme_submission_queue {
172         struct nvme_command *qbase;
173         pthread_mutex_t mtx;
174         uint32_t        size;
175         uint16_t        head; /* nvme progress */
176         uint16_t        tail; /* guest progress */
177         uint16_t        cqid; /* completion queue id */
178         int             qpriority;
179 };
180
181 enum nvme_storage_type {
182         NVME_STOR_BLOCKIF = 0,
183         NVME_STOR_RAM = 1,
184 };
185
186 struct pci_nvme_blockstore {
187         enum nvme_storage_type type;
188         void            *ctx;
189         uint64_t        size;
190         uint32_t        sectsz;
191         uint32_t        sectsz_bits;
192         uint64_t        eui64;
193         uint32_t        deallocate:1;
194 };
195
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
203 #define MDTS_PAD_SIZE \
204         (NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205          NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206          0)
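/*
 * Worked example for the defaults above: NVME_MDTS = 9 with a 4 KiB MPSMIN
 * gives NVME_MAX_DATA_SIZE = 512 * 4 KiB = 2 MiB per request and
 * NVME_MAX_IOVEC = 513 descriptors. Assuming BLOCKIF_IOV_MAX is 128 (see
 * block_if.h for the authoritative value), MDTS_PAD_SIZE comes to
 * 513 - 128 = 385 padding iovec entries.
 */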
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 struct pci_nvme_softc {
255         struct pci_devinst *nsc_pi;
256
257         pthread_mutex_t mtx;
258
259         struct nvme_registers regs;
260
261         struct nvme_namespace_data  nsdata;
262         struct nvme_controller_data ctrldata;
263         struct nvme_error_information_entry err_log;
264         struct nvme_health_information_page health_log;
265         struct nvme_firmware_page fw_log;
266
267         struct pci_nvme_blockstore nvstore;
268
269         uint16_t        max_qentries;   /* max entries per queue */
270         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
271         uint32_t        num_cqueues;
272         uint32_t        num_squeues;
273         bool            num_q_is_set; /* Has host set Number of Queues */
274
275         struct pci_nvme_ioreq *ioreqs;
276         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
277         uint32_t        pending_ios;
278         uint32_t        ioslots;
279         sem_t           iosemlock;
280
281         /*
282          * Memory mapped Submission and Completion queues
283          * Each array includes both Admin and IO queues
284          */
285         struct nvme_completion_queue *compl_queues;
286         struct nvme_submission_queue *submit_queues;
287
288         struct nvme_feature_obj feat[NVME_FID_MAX];
289
290         enum nvme_dsm_type dataset_management;
291
292         /* Accounting for SMART data */
293         __uint128_t     read_data_units;
294         __uint128_t     write_data_units;
295         __uint128_t     read_commands;
296         __uint128_t     write_commands;
297         uint32_t        read_dunits_remainder;
298         uint32_t        write_dunits_remainder;
299 };
300
301
302 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
303 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
304 static void pci_nvme_io_done(struct blockif_req *, int);
305
306 /* Controller Configuration utils */
307 #define NVME_CC_GET_EN(cc) \
308         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
309 #define NVME_CC_GET_CSS(cc) \
310         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
311 #define NVME_CC_GET_SHN(cc) \
312         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
313 #define NVME_CC_GET_IOSQES(cc) \
314         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
315 #define NVME_CC_GET_IOCQES(cc) \
316         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
317
318 #define NVME_CC_WRITE_MASK \
319         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
320          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
321          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
322
323 #define NVME_CC_NEN_WRITE_MASK \
324         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
325          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
326          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
327
328 /* Controller Status utils */
329 #define NVME_CSTS_GET_RDY(sts) \
330         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
331
332 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
333
334 /* Completion Queue status word utils */
335 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
336 #define NVME_STATUS_MASK \
337         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
338          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
339
340 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
341         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
342
343 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
344     struct nvme_feature_obj *,
345     struct nvme_command *,
346     struct nvme_completion *);
347 static void nvme_feature_num_queues(struct pci_nvme_softc *,
348     struct nvme_feature_obj *,
349     struct nvme_command *,
350     struct nvme_completion *);
351
352 static __inline void
353 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
354 {
355         size_t len;
356
357         len = strnlen(src, dst_size);
358         memset(dst, pad, dst_size);
359         memcpy(dst, src, len);
360 }
361
362 static __inline void
363 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
364 {
365
366         *status &= ~NVME_STATUS_MASK;
367         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
368                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
369 }
370
371 static __inline void
372 pci_nvme_status_genc(uint16_t *status, uint16_t code)
373 {
374
375         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
376 }
377
378 /*
379  * Initialize the requested number of IO Submission and Completion Queues.
380  * Admin queues are allocated implicitly.
381  */
382 static void
383 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
384 {
385         uint32_t i;
386
387         /*
388          * Allocate and initialize the Submission Queues
389          */
390         if (nsq > NVME_QUEUES) {
391                 WPRINTF("%s: clamping number of SQ from %u to %u",
392                                         __func__, nsq, NVME_QUEUES);
393                 nsq = NVME_QUEUES;
394         }
395
396         sc->num_squeues = nsq;
397
398         sc->submit_queues = calloc(sc->num_squeues + 1,
399                                 sizeof(struct nvme_submission_queue));
400         if (sc->submit_queues == NULL) {
401                 WPRINTF("%s: SQ allocation failed", __func__);
402                 sc->num_squeues = 0;
403         } else {
404                 struct nvme_submission_queue *sq = sc->submit_queues;
405
406                 for (i = 0; i < sc->num_squeues + 1; i++)
407                         pthread_mutex_init(&sq[i].mtx, NULL);
408         }
409
410         /*
411          * Allocate and initialize the Completion Queues
412          */
413         if (ncq > NVME_QUEUES) {
414                 WPRINTF("%s: clamping number of CQ from %u to %u",
415                                         __func__, ncq, NVME_QUEUES);
416                 ncq = NVME_QUEUES;
417         }
418
419         sc->num_cqueues = ncq;
420
421         sc->compl_queues = calloc(sc->num_cqueues + 1,
422                                 sizeof(struct nvme_completion_queue));
423         if (sc->compl_queues == NULL) {
424                 WPRINTF("%s: CQ allocation failed", __func__);
425                 sc->num_cqueues = 0;
426         } else {
427                 struct nvme_completion_queue *cq = sc->compl_queues;
428
429                 for (i = 0; i < sc->num_cqueues + 1; i++)
430                         pthread_mutex_init(&cq[i].mtx, NULL);
431         }
432 }
433
434 static void
435 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
436 {
437         struct nvme_controller_data *cd = &sc->ctrldata;
438
439         cd->vid = 0xFB5D;
440         cd->ssvid = 0x0000;
441
442         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
443         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
444
445         /* Num of submission commands that we can handle at a time (2^rab) */
446         cd->rab   = 4;
447
448         /* FreeBSD OUI */
449         cd->ieee[0] = 0x58;
450         cd->ieee[1] = 0x9c;
451         cd->ieee[2] = 0xfc;
452
453         cd->mic = 0;
454
455         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
456
457         cd->ver = 0x00010300;
458
459         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
460         cd->acl = 2;
461         cd->aerl = 4;
462
463         cd->lpa = 0;    /* TODO: support some simple things like SMART */
464         cd->elpe = 0;   /* max error log page entries */
465         cd->npss = 1;   /* number of power states support */
466
467         /* Warning Composite Temperature Threshold */
468         cd->wctemp = 0x0157;
469
470         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
471             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
472         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
473             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
474         cd->nn = 1;     /* number of namespaces */
475
476         cd->oncs = 0;
477         switch (sc->dataset_management) {
478         case NVME_DATASET_MANAGEMENT_AUTO:
479                 if (sc->nvstore.deallocate)
480                         cd->oncs |= NVME_ONCS_DSM;
481                 break;
482         case NVME_DATASET_MANAGEMENT_ENABLE:
483                 cd->oncs |= NVME_ONCS_DSM;
484                 break;
485         default:
486                 break;
487         }
488
489         cd->fna = 0x03;
490
491         cd->power_state[0].mp = 10;
492 }
493
494 /*
495  * Calculate the CRC-16 of the given buffer
496  * See copyright attribution at top of file
497  */
498 static uint16_t
499 crc16(uint16_t crc, const void *buffer, unsigned int len)
500 {
501         const unsigned char *cp = buffer;
502         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
503         static uint16_t const crc16_table[256] = {
504                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
505                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
506                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
507                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
508                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
509                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
510                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
511                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
512                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
513                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
514                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
515                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
516                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
517                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
518                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
519                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
520                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
521                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
522                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
523                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
524                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
525                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
526                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
527                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
528                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
529                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
530                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
531                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
532                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
533                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
534                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
535                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
536         };
537
538         while (len--)
539                 crc = (((crc >> 8) & 0xffU) ^
540                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
541         return crc;
542 }
543
544 static void
545 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
546     struct nvme_namespace_data *nd, uint32_t nsid,
547     struct pci_nvme_blockstore *nvstore)
548 {
549
550         /* Get capacity and block size information from backing store */
551         nd->nsze = nvstore->size / nvstore->sectsz;
552         nd->ncap = nd->nsze;
553         nd->nuse = nd->nsze;
554
555         if (nvstore->type == NVME_STOR_BLOCKIF)
556                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
557
558         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
559         nd->flbas = 0;
560
561         /* Create an EUI-64 if user did not provide one */
562         if (nvstore->eui64 == 0) {
563                 char *data = NULL;
564                 uint64_t eui64 = nvstore->eui64;
565
566                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
567                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
568
569                 if (data != NULL) {
570                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
571                         free(data);
572                 }
573                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
574         }
575         be64enc(nd->eui64, nvstore->eui64);
576
577         /* LBA data size = 2^lbads (e.g. a 512 byte sector size gives lbads = 9) */
578         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
579 }
580
581 static void
582 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
583 {
584
585         memset(&sc->err_log, 0, sizeof(sc->err_log));
586         memset(&sc->health_log, 0, sizeof(sc->health_log));
587         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
588
589         /* Set read/write remainder to round up according to spec */
590         sc->read_dunits_remainder = 999;
591         sc->write_dunits_remainder = 999;
592 }
593
594 static void
595 pci_nvme_init_features(struct pci_nvme_softc *sc)
596 {
597
598         sc->feat[0].set = nvme_feature_invalid_cb;
599         sc->feat[0].get = nvme_feature_invalid_cb;
600
601         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
602         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
603         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
604 }
605
606 static void
607 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
608 {
609         uint32_t i;
610
611         DPRINTF("%s", __func__);
612
613         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
614             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
615             (60 << NVME_CAP_LO_REG_TO_SHIFT);
616
617         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
618
619         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
620
621         sc->regs.cc = 0;
622         sc->regs.csts = 0;
623
624         assert(sc->submit_queues != NULL);
625
626         for (i = 0; i < sc->num_squeues + 1; i++) {
627                 sc->submit_queues[i].qbase = NULL;
628                 sc->submit_queues[i].size = 0;
629                 sc->submit_queues[i].cqid = 0;
630                 sc->submit_queues[i].tail = 0;
631                 sc->submit_queues[i].head = 0;
632         }
633
634         assert(sc->compl_queues != NULL);
635
636         for (i = 0; i < sc->num_cqueues + 1; i++) {
637                 sc->compl_queues[i].qbase = NULL;
638                 sc->compl_queues[i].size = 0;
639                 sc->compl_queues[i].tail = 0;
640                 sc->compl_queues[i].head = 0;
641         }
642
643         sc->num_q_is_set = false;
644 }
645
646 static void
647 pci_nvme_reset(struct pci_nvme_softc *sc)
648 {
649         pthread_mutex_lock(&sc->mtx);
650         pci_nvme_reset_locked(sc);
651         pthread_mutex_unlock(&sc->mtx);
652 }
653
654 static void
655 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
656 {
657         uint16_t acqs, asqs;
658
659         DPRINTF("%s", __func__);
660
661         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
662         sc->submit_queues[0].size = asqs;
663         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
664                     sizeof(struct nvme_command) * asqs);
665
666         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
667                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
668
669         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
670             NVME_AQA_REG_ACQS_MASK) + 1;
671         sc->compl_queues[0].size = acqs;
672         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
673                  sizeof(struct nvme_completion) * acqs);
674         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
675
676         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
677                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
678 }
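/*
 * Worked example (register value assumed for illustration): a guest write
 * of AQA = 0x001f001f sets ASQS = ACQS = 0x1f; the zero-based encoding
 * above therefore maps a 32-entry Admin Submission Queue at ASQ and a
 * 32-entry Admin Completion Queue at ACQ.
 */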
679
680 static int
681 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
682         size_t len, enum nvme_copy_dir dir)
683 {
684         uint8_t *p;
685         size_t bytes;
686
687         if (len > (8 * 1024)) {
688                 return (-1);
689         }
690
691         /* Copy from the start of prp1 to the end of the physical page */
692         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
693         bytes = MIN(bytes, len);
694
695         p = vm_map_gpa(ctx, prp1, bytes);
696         if (p == NULL) {
697                 return (-1);
698         }
699
700         if (dir == NVME_COPY_TO_PRP)
701                 memcpy(p, b, bytes);
702         else
703                 memcpy(b, p, bytes);
704
705         b += bytes;
706
707         len -= bytes;
708         if (len == 0) {
709                 return (0);
710         }
711
712         len = MIN(len, PAGE_SIZE);
713
714         p = vm_map_gpa(ctx, prp2, len);
715         if (p == NULL) {
716                 return (-1);
717         }
718
719         if (dir == NVME_COPY_TO_PRP)
720                 memcpy(p, b, len);
721         else
722                 memcpy(b, p, len);
723
724         return (0);
725 }
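/*
 * Worked example (addresses assumed for illustration): with 4 KiB pages,
 * copying len = 6144 bytes with prp1 = 0x10800 first takes the 2048 bytes
 * up to the end of prp1's page, then the remaining 4096 bytes from prp2.
 * Only the two-entry PRP case is handled here, hence the 8 KiB cap above.
 */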
726
727 /*
728  * Write a Completion Queue Entry update
729  *
730  * Write the completion and update the doorbell value
731  */
732 static void
733 pci_nvme_cq_update(struct pci_nvme_softc *sc,
734                 struct nvme_completion_queue *cq,
735                 uint32_t cdw0,
736                 uint16_t cid,
737                 uint16_t sqid,
738                 uint16_t status)
739 {
740         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
741         struct nvme_completion *cqe;
742
743         assert(cq->qbase != NULL);
744
745         pthread_mutex_lock(&cq->mtx);
746
747         cqe = &cq->qbase[cq->tail];
748
749         /* Flip the phase bit */
750         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
751
752         cqe->cdw0 = cdw0;
753         cqe->sqhd = sq->head;
754         cqe->sqid = sqid;
755         cqe->cid = cid;
756         cqe->status = status;
757
758         cq->tail++;
759         if (cq->tail >= cq->size) {
760                 cq->tail = 0;
761         }
762
763         pthread_mutex_unlock(&cq->mtx);
764 }
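/*
 * Phase bit note: the guest is expected to zero a newly created CQ, so the
 * XOR above posts entries with P = 1 on the first pass through the queue;
 * each wrap-around flips the polarity, letting the guest spot new
 * completions without any head pointer from the controller.
 */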
765
766 static int
767 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
768         struct nvme_completion* compl)
769 {
770         uint16_t qid = command->cdw10 & 0xffff;
771
772         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
773         if (qid == 0 || qid > sc->num_squeues ||
774             (sc->submit_queues[qid].qbase == NULL)) {
775                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
776                         __func__, qid, sc->num_squeues);
777                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
778                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
779                 return (1);
780         }
781
782         sc->submit_queues[qid].qbase = NULL;
783         sc->submit_queues[qid].cqid = 0;
784         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
785         return (1);
786 }
787
788 static int
789 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
790         struct nvme_completion* compl)
791 {
792         if (command->cdw11 & NVME_CMD_CDW11_PC) {
793                 uint16_t qid = command->cdw10 & 0xffff;
794                 struct nvme_submission_queue *nsq;
795
796                 if ((qid == 0) || (qid > sc->num_squeues) ||
797                     (sc->submit_queues[qid].qbase != NULL)) {
798                         WPRINTF("%s queue index %u > num_squeues %u",
799                                 __func__, qid, sc->num_squeues);
800                         pci_nvme_status_tc(&compl->status,
801                             NVME_SCT_COMMAND_SPECIFIC,
802                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
803                         return (1);
804                 }
805
806                 nsq = &sc->submit_queues[qid];
807                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
808                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
809                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
810                         /*
811                          * Queues must specify at least two entries
812                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
813                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
814                          */
815                         pci_nvme_status_tc(&compl->status,
816                             NVME_SCT_COMMAND_SPECIFIC,
817                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
818                         return (1);
819                 }
820
821                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
822                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
823                         pci_nvme_status_tc(&compl->status,
824                             NVME_SCT_COMMAND_SPECIFIC,
825                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
826                         return (1);
827                 }
828
829                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
830                         pci_nvme_status_tc(&compl->status,
831                             NVME_SCT_COMMAND_SPECIFIC,
832                             NVME_SC_COMPLETION_QUEUE_INVALID);
833                         return (1);
834                 }
835
836                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
837
838                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
839                               sizeof(struct nvme_command) * (size_t)nsq->size);
840
841                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
842                         qid, nsq->size, nsq->qbase, nsq->cqid);
843
844                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
845
846                 DPRINTF("%s completed creating IOSQ qid %u",
847                          __func__, qid);
848         } else {
849                 /* 
850                  * Guest sent a non-contiguous submission queue request.
851                  * This setting is unsupported by this emulation.
852                  */
853                 WPRINTF("%s unsupported non-contig (list-based) "
854                          "create i/o submission queue", __func__);
855
856                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
857         }
858         return (1);
859 }
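/*
 * Example encoding (values assumed for illustration): cdw10 = 0x00ff0001
 * requests SQID 1 with 0xff + 1 = 256 entries (QSIZE is zero-based), and
 * cdw11 = 0x00010001 sets the PC bit and binds the queue to CQID 1.
 */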
860
861 static int
862 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
863         struct nvme_completion* compl)
864 {
865         uint16_t qid = command->cdw10 & 0xffff;
866         uint16_t sqid;
867
868         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
869         if (qid == 0 || qid > sc->num_cqueues ||
870             (sc->compl_queues[qid].qbase == NULL)) {
871                 WPRINTF("%s queue index %u / num_cqueues %u",
872                         __func__, qid, sc->num_cqueues);
873                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
874                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
875                 return (1);
876         }
877
878         /* Deleting an Active CQ is an error */
879         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
880                 if (sc->submit_queues[sqid].cqid == qid) {
881                         pci_nvme_status_tc(&compl->status,
882                             NVME_SCT_COMMAND_SPECIFIC,
883                             NVME_SC_INVALID_QUEUE_DELETION);
884                         return (1);
885                 }
886
887         sc->compl_queues[qid].qbase = NULL;
888         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
889         return (1);
890 }
891
892 static int
893 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
894         struct nvme_completion* compl)
895 {
896         struct nvme_completion_queue *ncq;
897         uint16_t qid = command->cdw10 & 0xffff;
898
899         /* Only support Physically Contiguous queues */
900         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
901                 WPRINTF("%s unsupported non-contig (list-based) "
902                          "create i/o completion queue",
903                          __func__);
904
905                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
906                 return (1);
907         }
908
909         if ((qid == 0) || (qid > sc->num_cqueues) ||
910             (sc->compl_queues[qid].qbase != NULL)) {
911                 WPRINTF("%s queue index %u > num_cqueues %u",
912                         __func__, qid, sc->num_cqueues);
913                 pci_nvme_status_tc(&compl->status,
914                     NVME_SCT_COMMAND_SPECIFIC,
915                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
916                 return (1);
917         }
918
919         ncq = &sc->compl_queues[qid];
920         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
921         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
922         if (ncq->intr_vec > (sc->max_queues + 1)) {
923                 pci_nvme_status_tc(&compl->status,
924                     NVME_SCT_COMMAND_SPECIFIC,
925                     NVME_SC_INVALID_INTERRUPT_VECTOR);
926                 return (1);
927         }
928
929         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
930         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
931                 /*
932                  * Queues must specify at least two entries
933                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
934                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
935                  */
936                 pci_nvme_status_tc(&compl->status,
937                     NVME_SCT_COMMAND_SPECIFIC,
938                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
939                 return (1);
940         }
941         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
942                      command->prp1,
943                      sizeof(struct nvme_completion) * (size_t)ncq->size);
944
945         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
946
947
948         return (1);
949 }
950
951 static int
952 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
953         struct nvme_completion* compl)
954 {
955         uint32_t logsize;
956         uint8_t logpage = command->cdw10 & 0xFF;
957
958         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
959
960         /*
961          * Command specifies the number of dwords to return in fields NUMDU
962          * and NUMDL. This is a zero-based value.
963          */
964         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
965         logsize *= sizeof(uint32_t);
966
967         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
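        /*
         * Worked example (values assumed for illustration): NUMDL = 0x3ff in
         * the upper half of cdw10 with NUMDU = 0 encodes 1023, so logsize
         * becomes 1024 dwords = 4096 bytes.
         */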
968
969         switch (logpage) {
970         case NVME_LOG_ERROR:
971                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
972                     command->prp2, (uint8_t *)&sc->err_log,
973                     MIN(logsize, sizeof(sc->err_log)),
974                     NVME_COPY_TO_PRP);
975                 break;
976         case NVME_LOG_HEALTH_INFORMATION:
977                 pthread_mutex_lock(&sc->mtx);
978                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
979                     sizeof(sc->health_log.data_units_read));
980                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
981                     sizeof(sc->health_log.data_units_written));
982                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
983                     sizeof(sc->health_log.host_read_commands));
984                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
985                     sizeof(sc->health_log.host_write_commands));
986                 pthread_mutex_unlock(&sc->mtx);
987
988                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
989                     command->prp2, (uint8_t *)&sc->health_log,
990                     MIN(logsize, sizeof(sc->health_log)),
991                     NVME_COPY_TO_PRP);
992                 break;
993         case NVME_LOG_FIRMWARE_SLOT:
994                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
995                     command->prp2, (uint8_t *)&sc->fw_log,
996                     MIN(logsize, sizeof(sc->fw_log)),
997                     NVME_COPY_TO_PRP);
998                 break;
999         default:
1000                 DPRINTF("%s get log page %x command not supported",
1001                         __func__, logpage);
1002
1003                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1004                     NVME_SC_INVALID_LOG_PAGE);
1005         }
1006
1007         return (1);
1008 }
1009
1010 static int
1011 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1012         struct nvme_completion* compl)
1013 {
1014         void *dest;
1015         uint16_t status = 0;
1016
1017         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1018                 command->cdw10 & 0xFF, command->nsid);
1019
1020         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1021
1022         switch (command->cdw10 & 0xFF) {
1023         case 0x00: /* return Identify Namespace data structure */
1024                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1025                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1026                     NVME_COPY_TO_PRP);
1027                 break;
1028         case 0x01: /* return Identify Controller data structure */
1029                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1030                     command->prp2, (uint8_t *)&sc->ctrldata,
1031                     sizeof(sc->ctrldata),
1032                     NVME_COPY_TO_PRP);
1033                 break;
1034         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1035                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1036                                   sizeof(uint32_t) * 1024);
1037                 ((uint32_t *)dest)[0] = 1;
1038                 ((uint32_t *)dest)[1] = 0;
1039                 break;
1040         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1041                 if (command->nsid != 1) {
1042                         pci_nvme_status_genc(&status,
1043                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1044                         break;
1045                 }
1046                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1047                                   sizeof(uint32_t) * 1024);
1048                 /* All bytes after the descriptor shall be zero */
1049                 bzero(dest, sizeof(uint32_t) * 1024);
1050
1051                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1052                 ((uint8_t *)dest)[0] = 1;
1053                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1054                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1055                 break;
1056         default:
1057                 DPRINTF("%s unsupported identify command requested 0x%x",
1058                          __func__, command->cdw10 & 0xFF);
1059                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1060                 break;
1061         }
1062
1063         compl->status = status;
1064         return (1);
1065 }
1066
1067 static const char *
1068 nvme_fid_to_name(uint8_t fid)
1069 {
1070         const char *name;
1071
1072         switch (fid) {
1073         case NVME_FEAT_ARBITRATION:
1074                 name = "Arbitration";
1075                 break;
1076         case NVME_FEAT_POWER_MANAGEMENT:
1077                 name = "Power Management";
1078                 break;
1079         case NVME_FEAT_LBA_RANGE_TYPE:
1080                 name = "LBA Range Type";
1081                 break;
1082         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1083                 name = "Temperature Threshold";
1084                 break;
1085         case NVME_FEAT_ERROR_RECOVERY:
1086                 name = "Error Recovery";
1087                 break;
1088         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1089                 name = "Volatile Write Cache";
1090                 break;
1091         case NVME_FEAT_NUMBER_OF_QUEUES:
1092                 name = "Number of Queues";
1093                 break;
1094         case NVME_FEAT_INTERRUPT_COALESCING:
1095                 name = "Interrupt Coalescing";
1096                 break;
1097         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1098                 name = "Interrupt Vector Configuration";
1099                 break;
1100         case NVME_FEAT_WRITE_ATOMICITY:
1101                 name = "Write Atomicity Normal";
1102                 break;
1103         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1104                 name = "Asynchronous Event Configuration";
1105                 break;
1106         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1107                 name = "Autonomous Power State Transition";
1108                 break;
1109         case NVME_FEAT_HOST_MEMORY_BUFFER:
1110                 name = "Host Memory Buffer";
1111                 break;
1112         case NVME_FEAT_TIMESTAMP:
1113                 name = "Timestamp";
1114                 break;
1115         case NVME_FEAT_KEEP_ALIVE_TIMER:
1116                 name = "Keep Alive Timer";
1117                 break;
1118         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1119                 name = "Host Controlled Thermal Management";
1120                 break;
1121         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1122                 name = "Non-Operation Power State Config";
1123                 break;
1124         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1125                 name = "Read Recovery Level Config";
1126                 break;
1127         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1128                 name = "Predictable Latency Mode Config";
1129                 break;
1130         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1131                 name = "Predictable Latency Mode Window";
1132                 break;
1133         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1134                 name = "LBA Status Information Report Interval";
1135                 break;
1136         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1137                 name = "Host Behavior Support";
1138                 break;
1139         case NVME_FEAT_SANITIZE_CONFIG:
1140                 name = "Sanitize Config";
1141                 break;
1142         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1143                 name = "Endurance Group Event Configuration";
1144                 break;
1145         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1146                 name = "Software Progress Marker";
1147                 break;
1148         case NVME_FEAT_HOST_IDENTIFIER:
1149                 name = "Host Identifier";
1150                 break;
1151         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1152                 name = "Reservation Notification Mask";
1153                 break;
1154         case NVME_FEAT_RESERVATION_PERSISTENCE:
1155                 name = "Reservation Persistence";
1156                 break;
1157         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1158                 name = "Namespace Write Protection Config";
1159                 break;
1160         default:
1161                 name = "Unknown";
1162                 break;
1163         }
1164
1165         return (name);
1166 }
1167
1168 static void
1169 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1170     struct nvme_feature_obj *feat,
1171     struct nvme_command *command,
1172     struct nvme_completion *compl)
1173 {
1174
1175         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1176 }
1177
1178 static void
1179 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1180     struct nvme_feature_obj *feat,
1181     struct nvme_command *command,
1182     struct nvme_completion *compl)
1183 {
1184         uint16_t nqr;   /* Number of Queues Requested */
1185
1186         if (sc->num_q_is_set) {
1187                 WPRINTF("%s: Number of Queues already set", __func__);
1188                 pci_nvme_status_genc(&compl->status,
1189                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1190                 return;
1191         }
1192
1193         nqr = command->cdw11 & 0xFFFF;
1194         if (nqr == 0xffff) {
1195                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1196                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1197                 return;
1198         }
1199
1200         sc->num_squeues = ONE_BASED(nqr);
1201         if (sc->num_squeues > sc->max_queues) {
1202                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1203                                         sc->max_queues);
1204                 sc->num_squeues = sc->max_queues;
1205         }
1206
1207         nqr = (command->cdw11 >> 16) & 0xFFFF;
1208         if (nqr == 0xffff) {
1209                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1210                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1211                 return;
1212         }
1213
1214         sc->num_cqueues = ONE_BASED(nqr);
1215         if (sc->num_cqueues > sc->max_queues) {
1216                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1217                                         sc->max_queues);
1218                 sc->num_cqueues = sc->max_queues;
1219         }
1220
1221         /* Patch the command value which will be saved on callback's return */
1222         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1223         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1224
1225         sc->num_q_is_set = true;
1226 }
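/*
 * Example encoding (values assumed for illustration): a guest asking for
 * 4 SQs and 4 CQs sends cdw11 = 0x00030003 (both fields zero-based); if
 * max_queues permits the request, NVME_FEATURE_NUM_QUEUES(sc) returns the
 * same 0x00030003 in the completion's cdw0.
 */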
1227
1228 static int
1229 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1230         struct nvme_completion *compl)
1231 {
1232         struct nvme_feature_obj *feat;
1233         uint32_t nsid = command->nsid;
1234         uint8_t fid = command->cdw10 & 0xFF;
1235
1236         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1237
1238         if (fid >= NVME_FID_MAX) {
1239                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1240                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1241                 return (1);
1242         }
1243         feat = &sc->feat[fid];
1244
1245         if (!feat->namespace_specific &&
1246             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1247                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1248                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1249                 return (1);
1250         }
1251
1252         compl->cdw0 = 0;
1253         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1254
1255         if (feat->set)
1256                 feat->set(sc, feat, command, compl);
1257
1258         if (compl->status == NVME_SC_SUCCESS)
1259                 feat->cdw11 = command->cdw11;
1260
1261         return (0);
1262 }
1263
1264 static int
1265 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1266         struct nvme_completion* compl)
1267 {
1268         struct nvme_feature_obj *feat;
1269         uint8_t fid = command->cdw10 & 0xFF;
1270
1271         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1272
1273         if (fid >= NVME_FID_MAX) {
1274                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1275                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1276                 return (1);
1277         }
1278
1279         compl->cdw0 = 0;
1280         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1281
1282         feat = &sc->feat[fid];
1283         if (feat->get) {
1284                 feat->get(sc, feat, command, compl);
1285         }
1286
1287         if (compl->status == NVME_SC_SUCCESS) {
1288                 compl->cdw0 = feat->cdw11;
1289         }
1290
1291         return (0);
1292 }
1293
1294 static int
1295 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1296         struct nvme_completion* compl)
1297 {
1298         uint8_t ses, lbaf, pi;
1299
1300         /* Only supports Secure Erase Setting - User Data Erase */
1301         ses = (command->cdw10 >> 9) & 0x7;
1302         if (ses > 0x1) {
1303                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1304                 return (1);
1305         }
1306
1307         /* Only supports a single LBA Format */
1308         lbaf = command->cdw10 & 0xf;
1309         if (lbaf != 0) {
1310                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1311                     NVME_SC_INVALID_FORMAT);
1312                 return (1);
1313         }
1314
1315         /* Doesn't support Protection Information */
1316         pi = (command->cdw10 >> 5) & 0x7;
1317         if (pi != 0) {
1318                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1319                 return (1);
1320         }
1321
1322         if (sc->nvstore.type == NVME_STOR_RAM) {
1323                 if (sc->nvstore.ctx)
1324                         free(sc->nvstore.ctx);
1325                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1326                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1327         } else {
1328                 struct pci_nvme_ioreq *req;
1329                 int err;
1330
1331                 req = pci_nvme_get_ioreq(sc);
1332                 if (req == NULL) {
1333                         pci_nvme_status_genc(&compl->status,
1334                             NVME_SC_INTERNAL_DEVICE_ERROR);
1335                         WPRINTF("%s: unable to allocate IO req", __func__);
1336                         return (1);
1337                 }
1338                 req->nvme_sq = &sc->submit_queues[0];
1339                 req->sqid = 0;
1340                 req->opc = command->opc;
1341                 req->cid = command->cid;
1342                 req->nsid = command->nsid;
1343
1344                 req->io_req.br_offset = 0;
1345                 req->io_req.br_resid = sc->nvstore.size;
1346                 req->io_req.br_callback = pci_nvme_io_done;
1347
1348                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1349                 if (err) {
1350                         pci_nvme_status_genc(&compl->status,
1351                             NVME_SC_INTERNAL_DEVICE_ERROR);
1352                         pci_nvme_release_ioreq(sc, req);
1353                 }
1354         }
1355
1356         return (1);
1357 }
1358
1359 static int
1360 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1361         struct nvme_completion* compl)
1362 {
1363         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1364                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1365
1366         /* TODO: search for the command ID and abort it */
1367
1368         compl->cdw0 = 1;
1369         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1370         return (1);
1371 }
1372
1373 static int
1374 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1375         struct nvme_command* command, struct nvme_completion* compl)
1376 {
1377         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1378
1379         /*
1380          * TODO: raise events when they happen based on the Set Features cmd.
1381          * These events happen async, so only set completion successful if
1382          * there is an event reflective of the request to get event.
1383          */
1384         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1385             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1386         return (0);
1387 }
1388
1389 static void
1390 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1391 {
1392         struct nvme_completion compl;
1393         struct nvme_command *cmd;
1394         struct nvme_submission_queue *sq;
1395         struct nvme_completion_queue *cq;
1396         uint16_t sqhead;
1397
1398         DPRINTF("%s index %u", __func__, (uint32_t)value);
1399
1400         sq = &sc->submit_queues[0];
1401         cq = &sc->compl_queues[0];
1402
1403         pthread_mutex_lock(&sq->mtx);
1404
1405         sqhead = sq->head;
1406         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1407         
1408         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1409                 cmd = &(sq->qbase)[sqhead];
1410                 compl.cdw0 = 0;
1411                 compl.status = 0;
1412
1413                 switch (cmd->opc) {
1414                 case NVME_OPC_DELETE_IO_SQ:
1415                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1416                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1417                         break;
1418                 case NVME_OPC_CREATE_IO_SQ:
1419                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1420                         nvme_opc_create_io_sq(sc, cmd, &compl);
1421                         break;
1422                 case NVME_OPC_DELETE_IO_CQ:
1423                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1424                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1425                         break;
1426                 case NVME_OPC_CREATE_IO_CQ:
1427                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1428                         nvme_opc_create_io_cq(sc, cmd, &compl);
1429                         break;
1430                 case NVME_OPC_GET_LOG_PAGE:
1431                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1432                         nvme_opc_get_log_page(sc, cmd, &compl);
1433                         break;
1434                 case NVME_OPC_IDENTIFY:
1435                         DPRINTF("%s command IDENTIFY", __func__);
1436                         nvme_opc_identify(sc, cmd, &compl);
1437                         break;
1438                 case NVME_OPC_ABORT:
1439                         DPRINTF("%s command ABORT", __func__);
1440                         nvme_opc_abort(sc, cmd, &compl);
1441                         break;
1442                 case NVME_OPC_SET_FEATURES:
1443                         DPRINTF("%s command SET_FEATURES", __func__);
1444                         nvme_opc_set_features(sc, cmd, &compl);
1445                         break;
1446                 case NVME_OPC_GET_FEATURES:
1447                         DPRINTF("%s command GET_FEATURES", __func__);
1448                         nvme_opc_get_features(sc, cmd, &compl);
1449                         break;
1450                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1451                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1452                         /* XXX don't care, unhandled for now
1453                         nvme_opc_async_event_req(sc, cmd, &compl);
1454                         */
1455                         compl.status = NVME_NO_STATUS;
1456                         break;
1457                 case NVME_OPC_FORMAT_NVM:
1458                         DPRINTF("%s command FORMAT_NVM", __func__);
1459                         if ((sc->ctrldata.oacs &
1460                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1461                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                                     break;
1462                         }
1463                         compl.status = NVME_NO_STATUS;
1464                         nvme_opc_format_nvm(sc, cmd, &compl);
1465                         break;
1466                 default:
1467                         DPRINTF("0x%x command is not implemented",
1468                             cmd->opc);
1469                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1470                 }
1471                 sqhead = (sqhead + 1) % sq->size;
1472
1473                 if (NVME_COMPLETION_VALID(compl)) {
1474                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1475                             compl.cdw0,
1476                             cmd->cid,
1477                             0,          /* SQID */
1478                             compl.status);
1479                 }
1480         }
1481
1482         DPRINTF("setting sqhead %u", sqhead);
1483         sq->head = sqhead;
1484
1485         if (cq->head != cq->tail)
1486                 pci_generate_msix(sc->nsc_pi, 0);
1487
1488         pthread_mutex_unlock(&sq->mtx);
1489 }
1490
1491 /*
1492  * Update the Write and Read statistics reported in SMART data
1493  *
1494  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1495  * E.g. 1 data unit covers 1 - 1,000 512 byte blocks and 3 data units cover
1496  * 2,001 - 3,000 512 byte blocks. Rounding up is achieved by initializing
1497  * the remainder to 999.
1498  */
1498 static void
1499 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1500     size_t bytes, uint16_t status)
1501 {
1502
1503         pthread_mutex_lock(&sc->mtx);
1504         switch (opc) {
1505         case NVME_OPC_WRITE:
1506                 sc->write_commands++;
1507                 if (status != NVME_SC_SUCCESS)
1508                         break;
1509                 sc->write_dunits_remainder += (bytes / 512);
1510                 while (sc->write_dunits_remainder >= 1000) {
1511                         sc->write_data_units++;
1512                         sc->write_dunits_remainder -= 1000;
1513                 }
1514                 break;
1515         case NVME_OPC_READ:
1516                 sc->read_commands++;
1517                 if (status != NVME_SC_SUCCESS)
1518                         break;
1519                 sc->read_dunits_remainder += (bytes / 512);
1520                 while (sc->read_dunits_remainder >= 1000) {
1521                         sc->read_data_units++;
1522                         sc->read_dunits_remainder -= 1000;
1523                 }
1524                 break;
1525         default:
1526                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1527                 break;
1528         }
1529         pthread_mutex_unlock(&sc->mtx);
1530 }
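
     /*
      * Illustrative example (not part of the emulation): the remainder is
      * seeded to 999, so a single successful 4 KiB write adds
      * 4096 / 512 = 8 blocks, giving 1007, which the loop above carries
      * into one data unit with 7 blocks left over.  The same arithmetic
      * in isolation:
      *
      *         uint64_t units = 0, rem = 999;
      *
      *         rem += 4096 / 512;
      *         while (rem >= 1000) {
      *                 units++;
      *                 rem -= 1000;
      *         }
      *         assert(units == 1 && rem == 7);
      */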
1531
1532 /*
1533  * Check if the combination of Starting LBA (slba) and Number of Logical
1534  * Blocks (nlb) exceeds the range of the underlying storage.
1535  *
1536  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1537  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1538  * overflow.
1539  */
1540 static bool
1541 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1542     uint32_t nlb)
1543 {
1544         size_t  offset, bytes;
1545
1546         /* Overflow check of multiplying Starting LBA by the sector size */
1547         if (slba >> (64 - nvstore->sectsz_bits))
1548                 return (true);
1549
1550         offset = slba << nvstore->sectsz_bits;
1551         bytes = (size_t)nlb << nvstore->sectsz_bits;
1552
1553         /* Out-of-bounds start and overflow check of Number of Logical Blocks */
1554         if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
1555                 return (true);
1556
1557         return (false);
1558 }
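
     /*
      * Example (illustrative): with 512 byte sectors (sectsz_bits == 9),
      * an slba of (1ull << 55) makes "slba >> (64 - 9)" non-zero, so the
      * request is rejected before "slba << 9" could silently wrap the
      * 64-bit byte offset.
      */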
1559
1560 static int
1561 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1562         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1563 {
1564         int iovidx;
1565
1566         if (req == NULL)
1567                 return (-1);
1568
1569         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1570                 return (-1);
1571         }
1572
1573         /* concatenate contig block-iovs to minimize number of iovs */
1574         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1575                 iovidx = req->io_req.br_iovcnt - 1;
1576
1577                 req->io_req.br_iov[iovidx].iov_base =
1578                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1579                                      req->prev_gpaddr, size);
1580
1581                 req->prev_size += size;
1582                 req->io_req.br_resid += size;
1583
1584                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1585         } else {
1586                 iovidx = req->io_req.br_iovcnt;
1587                 if (iovidx == 0) {
1588                         req->io_req.br_offset = lba;
1589                         req->io_req.br_resid = 0;
1590                         req->io_req.br_param = req;
1591                 }
1592
1593                 req->io_req.br_iov[iovidx].iov_base =
1594                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1595                                      gpaddr, size);
1596
1597                 req->io_req.br_iov[iovidx].iov_len = size;
1598
1599                 req->prev_gpaddr = gpaddr;
1600                 req->prev_size = size;
1601                 req->io_req.br_resid += size;
1602
1603                 req->io_req.br_iovcnt++;
1604         }
1605
1606         return (0);
1607 }
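
     /*
      * Merge example (assumed addresses, for illustration only): if one
      * call appends 4 KiB at guest address 0x1000 and the next call
      * appends 4 KiB at 0x2000, the second call sees prev_gpaddr +
      * prev_size == gpaddr and widens the last iov to 8 KiB instead of
      * consuming another of the NVME_MAX_IOVEC slots.
      */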
1608
1609 static void
1610 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1611         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1612         uint32_t cdw0, uint16_t status)
1613 {
1614         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1615
1616         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1617                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1618                  NVME_STATUS_GET_SC(status));
1619
1620         pci_nvme_cq_update(sc, cq,
1621             cdw0,       /* CDW0 */
1622             cid,
1623             sqid,
1624             status);
1625
1626         if (cq->head != cq->tail) {
1627                 if (cq->intr_en & NVME_CQ_INTEN) {
1628                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1629                 } else {
1630                         DPRINTF("%s: CQ%u interrupt disabled",
1631                                                 __func__, sq->cqid);
1632                 }
1633         }
1634 }
1635
1636 static void
1637 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1638 {
1639         req->sc = NULL;
1640         req->nvme_sq = NULL;
1641         req->sqid = 0;
1642
1643         pthread_mutex_lock(&sc->mtx);
1644
1645         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1646         sc->pending_ios--;
1647
1648         /* once no IO is pending, the device can be set ready if reset/enabled */
1649         if (sc->pending_ios == 0 &&
1650             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1651                 sc->regs.csts |= NVME_CSTS_RDY;
1652
1653         pthread_mutex_unlock(&sc->mtx);
1654
1655         sem_post(&sc->iosemlock);
1656 }
1657
1658 static struct pci_nvme_ioreq *
1659 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1660 {
1661         struct pci_nvme_ioreq *req = NULL;
1662
1663         sem_wait(&sc->iosemlock);
1664         pthread_mutex_lock(&sc->mtx);
1665
1666         req = STAILQ_FIRST(&sc->ioreqs_free);
1667         assert(req != NULL);
1668         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1669
1670         req->sc = sc;
1671
1672         sc->pending_ios++;
1673
1674         pthread_mutex_unlock(&sc->mtx);
1675
1676         req->io_req.br_iovcnt = 0;
1677         req->io_req.br_offset = 0;
1678         req->io_req.br_resid = 0;
1679         req->io_req.br_param = req;
1680         req->prev_gpaddr = 0;
1681         req->prev_size = 0;
1682
1683         return (req);
1684 }
1685
1686 static void
1687 pci_nvme_io_done(struct blockif_req *br, int err)
1688 {
1689         struct pci_nvme_ioreq *req = br->br_param;
1690         struct nvme_submission_queue *sq = req->nvme_sq;
1691         uint16_t code, status;
1692
1693         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1694
1695         /* TODO return correct error */
1696         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1697         pci_nvme_status_genc(&status, code);
1698
1699         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1700         pci_nvme_stats_write_read_update(req->sc, req->opc,
1701             req->bytes, status);
1702         pci_nvme_release_ioreq(req->sc, req);
1703 }
1704
1705 /*
1706  * Implements the Flush command. The specification states:
1707  *    If a volatile write cache is not present, Flush commands complete
1708  *    successfully and have no effect
1709  * in the description of the Volatile Write Cache (VWC) field of the Identify
1710  * Controller data. Therefore, set status to Success if the command is
1711  * not supported (i.e. the backing store is RAM, or the blockif reports
      * EOPNOTSUPP).
1712  */
1713 static bool
1714 nvme_opc_flush(struct pci_nvme_softc *sc,
1715     struct nvme_command *cmd,
1716     struct pci_nvme_blockstore *nvstore,
1717     struct pci_nvme_ioreq *req,
1718     uint16_t *status)
1719 {
1720         bool pending = false;
1721
1722         if (nvstore->type == NVME_STOR_RAM) {
1723                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1724         } else {
1725                 int err;
1726
1727                 req->io_req.br_callback = pci_nvme_io_done;
1728
1729                 err = blockif_flush(nvstore->ctx, &req->io_req);
1730                 switch (err) {
1731                 case 0:
1732                         pending = true;
1733                         break;
1734                 case EOPNOTSUPP:
1735                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1736                         break;
1737                 default:
1738                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1739                 }
1740         }
1741
1742         return (pending);
1743 }
1744
1745 static uint16_t
1746 nvme_write_read_ram(struct pci_nvme_softc *sc,
1747     struct pci_nvme_blockstore *nvstore,
1748     uint64_t prp1, uint64_t prp2,
1749     size_t offset, uint64_t bytes,
1750     bool is_write)
1751 {
1752         uint8_t *buf = nvstore->ctx;
1753         enum nvme_copy_dir dir;
1754         uint16_t status;
1755
1756         if (is_write)
1757                 dir = NVME_COPY_TO_PRP;
1758         else
1759                 dir = NVME_COPY_FROM_PRP;
1760
1761         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1762             buf + offset, bytes, dir))
1763                 pci_nvme_status_genc(&status,
1764                     NVME_SC_DATA_TRANSFER_ERROR);
1765         else
1766                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1767
1768         return (status);
1769 }
1770
1771 static uint16_t
1772 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1773     struct pci_nvme_blockstore *nvstore,
1774     struct pci_nvme_ioreq *req,
1775     uint64_t prp1, uint64_t prp2,
1776     size_t offset, uint64_t bytes,
1777     bool is_write)
1778 {
1779         uint64_t size;
1780         int err;
1781         uint16_t status = NVME_NO_STATUS;
1782
1783         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1784         if (pci_nvme_append_iov_req(sc, req, prp1,
1785             size, is_write, offset)) {
1786                 pci_nvme_status_genc(&status,
1787                     NVME_SC_DATA_TRANSFER_ERROR);
1788                 goto out;
1789         }
1790
1791         offset += size;
1792         bytes  -= size;
1793
1794         if (bytes == 0) {
1795                 ;
1796         } else if (bytes <= PAGE_SIZE) {
1797                 size = bytes;
1798                 if (pci_nvme_append_iov_req(sc, req, prp2,
1799                     size, is_write, offset)) {
1800                         pci_nvme_status_genc(&status,
1801                             NVME_SC_DATA_TRANSFER_ERROR);
1802                         goto out;
1803                 }
1804         } else {
1805                 void *vmctx = sc->nsc_pi->pi_vmctx;
1806                 uint64_t *prp_list = &prp2;
1807                 uint64_t *last = prp_list;
1808
1809                 /* PRP2 is pointer to a physical region page list */
1810                 while (bytes) {
1811                         /* Last entry in list points to the next list */
1812                         if (prp_list == last) {
1813                                 uint64_t prp = *prp_list;
1814
1815                                 prp_list = paddr_guest2host(vmctx, prp,
1816                                     PAGE_SIZE - (prp % PAGE_SIZE));
1817                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
1818                         }
1819
1820                         size = MIN(bytes, PAGE_SIZE);
1821
1822                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
1823                             size, is_write, offset)) {
1824                                 pci_nvme_status_genc(&status,
1825                                     NVME_SC_DATA_TRANSFER_ERROR);
1826                                 goto out;
1827                         }
1828
1829                         offset += size;
1830                         bytes  -= size;
1831
1832                         prp_list++;
1833                 }
1834         }
1835         req->io_req.br_callback = pci_nvme_io_done;
1836         if (is_write)
1837                 err = blockif_write(nvstore->ctx, &req->io_req);
1838         else
1839                 err = blockif_read(nvstore->ctx, &req->io_req);
1840
1841         if (err)
1842                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
1843 out:
1844         return (status);
1845 }
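
     /*
      * PRP walk-through (illustrative, assuming 4 KiB pages): a 12 KiB
      * transfer with a page-aligned PRP1 spans three pages.  PRP1 maps
      * the first page; because more than one page remains, PRP2 is read
      * as the guest-physical address of a PRP list whose first two
      * entries map the rest.  Had exactly one page remained, PRP2 itself
      * would have been the second data pointer.
      */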
1846
1847 static bool
1848 nvme_opc_write_read(struct pci_nvme_softc *sc,
1849     struct nvme_command *cmd,
1850     struct pci_nvme_blockstore *nvstore,
1851     struct pci_nvme_ioreq *req,
1852     uint16_t *status)
1853 {
1854         uint64_t lba, nblocks, bytes;
1855         size_t offset;
1856         bool is_write = cmd->opc == NVME_OPC_WRITE;
1857         bool pending = false;
1858
1859         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1860         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1861         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
1862                 WPRINTF("%s command would exceed LBA range", __func__);
1863                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
1864                 goto out;
1865         }
1866
1867         bytes  = nblocks << nvstore->sectsz_bits;
1868         if (bytes > NVME_MAX_DATA_SIZE) {
1869                 WPRINTF("%s command would exceed MDTS", __func__);
1870                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
1871                 goto out;
1872         }
1873
1874         offset = lba << nvstore->sectsz_bits;
1875
1876         req->bytes = bytes;
1877         req->io_req.br_offset = lba;
1878
1879         /* PRP bits 1:0 must be zero */
1880         cmd->prp1 &= ~0x3UL;
1881         cmd->prp2 &= ~0x3UL;
1882
1883         if (nvstore->type == NVME_STOR_RAM) {
1884                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
1885                     cmd->prp2, offset, bytes, is_write);
1886         } else {
1887                 *status = nvme_write_read_blockif(sc, nvstore, req,
1888                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
1889
1890                 if (*status == NVME_NO_STATUS)
1891                         pending = true;
1892         }
1893 out:
1894         if (!pending)
1895                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
1896
1897         return (pending);
1898 }
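
     /*
      * Example decode (illustrative values): a Read with cdw10 = 0x1000,
      * cdw11 = 0, and cdw12 = 0x0007 has SLBA 0x1000 and a zero-based NLB
      * of 7, i.e. 8 blocks.  With 512 byte sectors that is byte offset
      * 0x200000 and 4 KiB of data, well under NVME_MAX_DATA_SIZE.
      */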
1899
1900 static void
1901 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1902 {
1903         struct pci_nvme_ioreq *req = br->br_param;
1904         struct pci_nvme_softc *sc = req->sc;
1905         bool done = true;
1906         uint16_t status;
1907
1908         if (err) {
1909                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1910         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1911                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1912         } else {
1913                 struct iovec *iov = req->io_req.br_iov;
1914
1915                 req->prev_gpaddr++;
1916                 iov += req->prev_gpaddr;
1917
1918                 /* The iov_* values already include the sector size */
1919                 req->io_req.br_offset = (off_t)iov->iov_base;
1920                 req->io_req.br_resid = iov->iov_len;
1921                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1922                         pci_nvme_status_genc(&status,
1923                             NVME_SC_INTERNAL_DEVICE_ERROR);
1924                 } else
1925                         done = false;
1926         }
1927
1928         if (done) {
1929                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1930                     req->cid, 0, status);
1931                 pci_nvme_release_ioreq(sc, req);
1932         }
1933 }
1934
1935 static bool
1936 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1937     struct nvme_command *cmd,
1938     struct pci_nvme_blockstore *nvstore,
1939     struct pci_nvme_ioreq *req,
1940     uint16_t *status)
1941 {
1942         int err;
1943         bool pending = false;
1944
1945         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1946                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1947                 goto out;
1948         }
1949
1950         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1951                 struct nvme_dsm_range *range;
1952                 size_t offset, bytes;
1953                 uint32_t nr, r;
1954                 int sectsz_bits = sc->nvstore.sectsz_bits;
1955
1956                 /*
1957                  * DSM calls are advisory only, and compliant controllers
1958                  * may choose to take no actions (i.e. return Success).
1959                  */
1960                 if (!nvstore->deallocate) {
1961                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1962                         goto out;
1963                 }
1964
1965                 if (req == NULL) {
1966                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1967                         goto out;
1968                 }
1969
1970                 /* copy locally because a range entry could straddle PRPs */
1971                 range = calloc(1, NVME_MAX_DSM_TRIM);
1972                 if (range == NULL) {
1973                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1974                         goto out;
1975                 }
1976                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1977                     (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1978
1979                 if (pci_nvme_out_of_range(nvstore, range[0].starting_lba,
1980                     range[0].length)) {
1981                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
                             free(range);
1982                         goto out;
1983                 }
1984                 offset = range[0].starting_lba << sectsz_bits;
1985                 bytes = range[0].length << sectsz_bits;
1986
1987                 /*
1988                  * If the request is for more than a single range, store
1989                  * the ranges in the br_iov. Optimize for the common case
1990                  * of a single range.
1991                  *
1992                  * Note that NVMe Number of Ranges is a zero based value
1993                  */
1994                 nr = cmd->cdw10 & 0xff;
1995
1996                 req->io_req.br_iovcnt = 0;
1997                 req->io_req.br_offset = offset;
1998                 req->io_req.br_resid = bytes;
1999
2000                 if (nr == 0) {
2001                         req->io_req.br_callback = pci_nvme_io_done;
2002                 } else {
2003                         struct iovec *iov = req->io_req.br_iov;
2004
2005                         for (r = 0; r <= nr; r++) {
2006                                 if (pci_nvme_out_of_range(nvstore, range[r].starting_lba,
2007                                     range[r].length)) {
2008                                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
                                             free(range);
2009                                         goto out;
2010                                 }
2011                                 offset = range[r].starting_lba << sectsz_bits;
2012                                 bytes = range[r].length << sectsz_bits;
2013                                 if ((nvstore->size - offset) < bytes) {
2014                                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
                                             free(range);
2015                                         goto out;
2016                                 }
2017                                 iov[r].iov_base = (void *)offset;
2018                                 iov[r].iov_len = bytes;
2019                         }
2020                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2021
2022                         /*
2023                          * Use prev_gpaddr to track the current entry and
2024                          * prev_size to track the number of entries
2025                          */
2026                         req->prev_gpaddr = 0;
2027                         req->prev_size = r;
2028                 }
2029
2030                 err = blockif_delete(nvstore->ctx, &req->io_req);
2031                 if (err)
2032                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2033                 else
2034                         pending = true;
2035
2036                 free(range);
2037         }
2038 out:
2039         return (pending);
2040 }
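
     /*
      * Illustrative deallocate request: to trim LBAs 0x100 - 0x1ff a
      * guest submits cdw10 = 0 (one range, zero based), cdw11 with
      * NVME_DSM_ATTR_DEALLOCATE set, and a single nvme_dsm_range with
      * starting_lba = 0x100 and length = 0x100 in the PRP-described
      * buffer.
      */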
2041
2042 static void
2043 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2044 {
2045         struct nvme_submission_queue *sq;
2046         uint16_t status;
2047         uint16_t sqhead;
2048
2049         /* handle all submissions up to sq->tail index */
2050         sq = &sc->submit_queues[idx];
2051
2052         pthread_mutex_lock(&sq->mtx);
2053
2054         sqhead = sq->head;
2055         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2056                  idx, sqhead, sq->tail, sq->qbase);
2057
2058         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2059                 struct nvme_command *cmd;
2060                 struct pci_nvme_ioreq *req;
2061                 uint32_t nsid;
2062                 bool pending;
2063
2064                 pending = false;
2065                 req = NULL;
2066                 status = 0;
2067
2068                 cmd = &sq->qbase[sqhead];
2069                 sqhead = (sqhead + 1) % sq->size;
2070
2071                 nsid = le32toh(cmd->nsid);
2072                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2073                         pci_nvme_status_genc(&status,
2074                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2075                         status |=
2076                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2077                         goto complete;
2078                 }
2079
2080                 req = pci_nvme_get_ioreq(sc);
2081                 if (req == NULL) {
2082                         pci_nvme_status_genc(&status,
2083                             NVME_SC_INTERNAL_DEVICE_ERROR);
2084                         WPRINTF("%s: unable to allocate IO req", __func__);
2085                         goto complete;
2086                 }
2087                 req->nvme_sq = sq;
2088                 req->sqid = idx;
2089                 req->opc = cmd->opc;
2090                 req->cid = cmd->cid;
2091                 req->nsid = cmd->nsid;
2092
2093                 switch (cmd->opc) {
2094                 case NVME_OPC_FLUSH:
2095                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2096                             req, &status);
2097                         break;
2098                 case NVME_OPC_WRITE:
2099                 case NVME_OPC_READ:
2100                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2101                             req, &status);
2102                         break;
2103                 case NVME_OPC_WRITE_ZEROES:
2104                         /* TODO: write zeroes
2105                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2106                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2107                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2108                         break;
2109                 case NVME_OPC_DATASET_MANAGEMENT:
2110                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2111                             req, &status);
2112                         break;
2113                 default:
2114                         WPRINTF("%s unhandled io command 0x%x",
2115                             __func__, cmd->opc);
2116                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2117                 }
2118 complete:
2119                 if (!pending) {
2120                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2121                             status);
2122                         if (req != NULL)
2123                                 pci_nvme_release_ioreq(sc, req);
2124                 }
2125         }
2126
2127         sq->head = sqhead;
2128
2129         pthread_mutex_unlock(&sq->mtx);
2130 }
2131
2132 static void
2133 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2134         uint64_t idx, int is_sq, uint64_t value)
2135 {
2136         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2137                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2138
2139         if (is_sq) {
2140                 if (idx > sc->num_squeues) {
2141                         WPRINTF("%s queue index %lu overflow from "
2142                                  "guest (max %u)",
2143                                  __func__, idx, sc->num_squeues);
2144                         return;
2145                 }
2146
2147                 atomic_store_short(&sc->submit_queues[idx].tail,
2148                                    (uint16_t)value);
2149
2150                 if (idx == 0) {
2151                         pci_nvme_handle_admin_cmd(sc, value);
2152                 } else {
2153                         /* I/O submission queue; index validated above */
2160                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2161                 }
2162         } else {
2163                 if (idx > sc->num_cqueues) {
2164                         WPRINTF("%s queue index %lu overflow from "
2165                                  "guest (max %u)",
2166                                  __func__, idx, sc->num_cqueues);
2167                         return;
2168                 }
2169
2170                 atomic_store_short(&sc->compl_queues[idx].head,
2171                                 (uint16_t)value);
2172         }
2173 }
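
     /*
      * Doorbell layout example (illustrative, doorbell stride 0): the
      * admin SQ tail doorbell sits at the first doorbell offset
      * (NVME_DOORBELL_OFFSET) and the admin CQ head doorbell 4 bytes
      * later; I/O queue 1 uses the following 8 byte pair.  Hence
      * "belloffset / 8" recovers the queue index and the position within
      * each 8 byte pair selects SQ tail versus CQ head.
      */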
2174
2175 static void
2176 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2177 {
2178         const char *s = iswrite ? "WRITE" : "READ";
2179
2180         switch (offset) {
2181         case NVME_CR_CAP_LOW:
2182                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2183                 break;
2184         case NVME_CR_CAP_HI:
2185                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2186                 break;
2187         case NVME_CR_VS:
2188                 DPRINTF("%s %s NVME_CR_VS", func, s);
2189                 break;
2190         case NVME_CR_INTMS:
2191                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2192                 break;
2193         case NVME_CR_INTMC:
2194                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2195                 break;
2196         case NVME_CR_CC:
2197                 DPRINTF("%s %s NVME_CR_CC", func, s);
2198                 break;
2199         case NVME_CR_CSTS:
2200                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2201                 break;
2202         case NVME_CR_NSSR:
2203                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2204                 break;
2205         case NVME_CR_AQA:
2206                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2207                 break;
2208         case NVME_CR_ASQ_LOW:
2209                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2210                 break;
2211         case NVME_CR_ASQ_HI:
2212                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2213                 break;
2214         case NVME_CR_ACQ_LOW:
2215                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2216                 break;
2217         case NVME_CR_ACQ_HI:
2218                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2219                 break;
2220         default:
2221                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2222         }
2224 }
2225
2226 static void
2227 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2228         uint64_t offset, int size, uint64_t value)
2229 {
2230         uint32_t ccreg;
2231
2232         if (offset >= NVME_DOORBELL_OFFSET) {
2233                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2234                 uint64_t idx = belloffset / 8; /* doorbell pair = 2 * 32-bit regs */
2235                 int is_sq = (belloffset % 8) < 4;
2236
2237                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2238                         WPRINTF("guest attempted an overflow write offset "
2239                                  "0x%lx, val 0x%lx in %s",
2240                                  offset, value, __func__);
2241                         return;
2242                 }
2243
2244                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2245                 return;
2246         }
2247
2248         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2249                 offset, size, value);
2250
2251         if (size != 4) {
2252                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2253                          "val 0x%lx) to bar0 in %s",
2254                          size, offset, value, __func__);
2255                 /* TODO: shutdown device */
2256                 return;
2257         }
2258
2259         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2260
2261         pthread_mutex_lock(&sc->mtx);
2262
2263         switch (offset) {
2264         case NVME_CR_CAP_LOW:
2265         case NVME_CR_CAP_HI:
2266                 /* readonly */
2267                 break;
2268         case NVME_CR_VS:
2269                 /* readonly */
2270                 break;
2271         case NVME_CR_INTMS:
2272                 /* MSI-X, so ignore */
2273                 break;
2274         case NVME_CR_INTMC:
2275                 /* MSI-X, so ignore */
2276                 break;
2277         case NVME_CR_CC:
2278                 ccreg = (uint32_t)value;
2279
2280                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2281                          "iocqes %u",
2282                         __func__,
2283                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2284                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2285                          NVME_CC_GET_IOCQES(ccreg));
2286
2287                 if (NVME_CC_GET_SHN(ccreg)) {
2288                         /* perform shutdown - flush out data to backend */
2289                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2290                             NVME_CSTS_REG_SHST_SHIFT);
2291                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2292                             NVME_CSTS_REG_SHST_SHIFT;
2293                 }
2294                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2295                         if (NVME_CC_GET_EN(ccreg) == 0)
2296                                 /* transition 1->0 causes controller reset */
2297                                 pci_nvme_reset_locked(sc);
2298                         else
2299                                 pci_nvme_init_controller(ctx, sc);
2300                 }
2301
2302                 /* Insert the iocqes, iosqes and en bits from the write */
2303                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2304                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2305                 if (NVME_CC_GET_EN(ccreg) == 0) {
2306                         /* Insert the ams, mps and css bit fields */
2307                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2308                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2309                         sc->regs.csts &= ~NVME_CSTS_RDY;
2310                 } else if (sc->pending_ios == 0) {
2311                         sc->regs.csts |= NVME_CSTS_RDY;
2312                 }
2313                 break;
2314         case NVME_CR_CSTS:
2315                 break;
2316         case NVME_CR_NSSR:
2317                 /* ignore writes; don't support subsystem reset */
2318                 break;
2319         case NVME_CR_AQA:
2320                 sc->regs.aqa = (uint32_t)value;
2321                 break;
2322         case NVME_CR_ASQ_LOW:
2323                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2324                                (0xFFFFF000 & value);
2325                 break;
2326         case NVME_CR_ASQ_HI:
2327                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2328                                (value << 32);
2329                 break;
2330         case NVME_CR_ACQ_LOW:
2331                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2332                                (0xFFFFF000 & value);
2333                 break;
2334         case NVME_CR_ACQ_HI:
2335                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2336                                (value << 32);
2337                 break;
2338         default:
2339                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2340                          __func__, offset, value, size);
2341         }
2342         pthread_mutex_unlock(&sc->mtx);
2343 }
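
     /*
      * Typical enable sequence (illustrative): a guest programs AQA, ASQ,
      * and ACQ, then sets CC.EN.  The 0->1 EN transition above calls
      * pci_nvme_init_controller(), and CSTS.RDY is reported once no I/O
      * is pending; clearing CC.EN resets controller state via
      * pci_nvme_reset_locked().
      */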
2344
2345 static void
2346 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2347                 int baridx, uint64_t offset, int size, uint64_t value)
2348 {
2349         struct pci_nvme_softc* sc = pi->pi_arg;
2350
2351         if (baridx == pci_msix_table_bar(pi) ||
2352             baridx == pci_msix_pba_bar(pi)) {
2353                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2354                          " value 0x%lx", baridx, offset, size, value);
2355
2356                 pci_emul_msix_twrite(pi, offset, size, value);
2357                 return;
2358         }
2359
2360         switch (baridx) {
2361         case 0:
2362                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2363                 break;
2364
2365         default:
2366                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2367                          __func__, baridx, value);
2368         }
2369 }
2370
2371 static uint64_t
2372 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
2373 {
2374         uint64_t value;
2375
2376         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2377
2378         if (offset < NVME_DOORBELL_OFFSET) {
2379                 void *p = &(sc->regs);
2380                 pthread_mutex_lock(&sc->mtx);
2381                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2382                 pthread_mutex_unlock(&sc->mtx);
2383         } else {
2384                 value = 0;
2385                 WPRINTF("pci_nvme: read invalid offset %lu", offset);
2386         }
2387
2388         switch (size) {
2389         case 1:
2390                 value &= 0xFF;
2391                 break;
2392         case 2:
2393                 value &= 0xFFFF;
2394                 break;
2395         case 4:
2396                 value &= 0xFFFFFFFF;
2397                 break;
2398         }
2399
2400         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2401                  offset, size, (uint32_t)value);
2402
2403         return (value);
2404 }
2405
2408 static uint64_t
2409 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2410     uint64_t offset, int size)
2411 {
2412         struct pci_nvme_softc* sc = pi->pi_arg;
2413
2414         if (baridx == pci_msix_table_bar(pi) ||
2415             baridx == pci_msix_pba_bar(pi)) {
2416                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2417                         baridx, offset, size);
2418
2419                 return pci_emul_msix_tread(pi, offset, size);
2420         }
2421
2422         switch (baridx) {
2423         case 0:
2424                 return pci_nvme_read_bar_0(sc, offset, size);
2425
2426         default:
2427                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2428         }
2429
2430         return (0);
2431 }
2432
2433
2434 static int
2435 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2436 {
2437         char bident[sizeof("XX:X:X")];
2438         char    *uopt, *xopts, *config;
2439         uint32_t sectsz;
2440         int optidx;
2441
2442         sc->max_queues = NVME_QUEUES;
2443         sc->max_qentries = NVME_MAX_QENTRIES;
2444         sc->ioslots = NVME_IOSLOTS;
2445         sc->num_squeues = sc->max_queues;
2446         sc->num_cqueues = sc->max_queues;
2447         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2448         sectsz = 0;
2449
         if (opts == NULL) {
                 EPRINTLN("backing store not specified");
                 return (-1);
         }

2450         uopt = strdup(opts);
2451         optidx = 0;
2452         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2453                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2454         for (xopts = strtok(uopt, ",");
2455              xopts != NULL;
2456              xopts = strtok(NULL, ",")) {
2457
2458                 if ((config = strchr(xopts, '=')) != NULL)
2459                         *config++ = '\0';
2460
2461                 if (!strcmp("maxq", xopts)) {
2462                         sc->max_queues = atoi(config);
2463                 } else if (!strcmp("qsz", xopts)) {
2464                         sc->max_qentries = atoi(config);
2465                 } else if (!strcmp("ioslots", xopts)) {
2466                         sc->ioslots = atoi(config);
2467                 } else if (!strcmp("sectsz", xopts)) {
2468                         sectsz = atoi(config);
2469                 } else if (!strcmp("ser", xopts)) {
2470                         /*
2471                          * This field indicates the Product Serial Number in
2472                          * 7-bit ASCII, unused bytes should be space characters.
2473                          * Ref: NVMe v1.3c.
2474                          */
2475                         cpywithpad((char *)sc->ctrldata.sn,
2476                                    sizeof(sc->ctrldata.sn), config, ' ');
2477                 } else if (!strcmp("ram", xopts)) {
2478                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
2479
2480                         sc->nvstore.type = NVME_STOR_RAM;
2481                         sc->nvstore.size = sz * 1024 * 1024;
2482                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2483                         sc->nvstore.sectsz = 4096;
2484                         sc->nvstore.sectsz_bits = 12;
2485                         if (sc->nvstore.ctx == NULL) {
2486                                 perror("Unable to allocate RAM");
2487                                 free(uopt);
2488                                 return (-1);
2489                         }
2490                 } else if (!strcmp("eui64", xopts)) {
2491                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2492                 } else if (!strcmp("dsm", xopts)) {
2493                         if (!strcmp("auto", config))
2494                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2495                         else if (!strcmp("enable", config))
2496                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2497                         else if (!strcmp("disable", config))
2498                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2499                 } else if (optidx == 0) {
2500                         snprintf(bident, sizeof(bident), "%d:%d",
2501                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2502                         sc->nvstore.ctx = blockif_open(xopts, bident);
2503                         if (sc->nvstore.ctx == NULL) {
2504                                 perror("Could not open backing file");
2505                                 free(uopt);
2506                                 return (-1);
2507                         }
2508                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2509                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2510                 } else {
2511                         EPRINTLN("Invalid option %s", xopts);
2512                         free(uopt);
2513                         return (-1);
2514                 }
2515
2516                 optidx++;
2517         }
2518         free(uopt);
2519
2520         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2521                 EPRINTLN("backing store not specified");
2522                 return (-1);
2523         }
2524         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2525                 sc->nvstore.sectsz = sectsz;
2526         else if (sc->nvstore.type != NVME_STOR_RAM)
2527                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2528         for (sc->nvstore.sectsz_bits = 9;
2529              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2530              sc->nvstore.sectsz_bits++);
2531
2532         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2533                 sc->max_queues = NVME_QUEUES;
2534
2535         if (sc->max_qentries <= 0) {
2536                 EPRINTLN("Invalid qsz option");
2537                 return (-1);
2538         }
2539         if (sc->ioslots <= 0) {
2540                 EPRINTLN("Invalid ioslots option");
2541                 return (-1);
2542         }
2543
2544         return (0);
2545 }
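
     /*
      * Example slot configuration (illustrative) that this parser
      * accepts:
      *
      *         -s 4,nvme,/dev/zvol/tank/vm0,sectsz=4096,ser=SN123456
      *
      * The first option token, unless it matches a known key, is taken
      * as the backing device path (the optidx == 0 case above).
      */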
2546
2547 static int
2548 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2549 {
2550         struct pci_nvme_softc *sc;
2551         uint32_t pci_membar_sz;
2552         int     error;
2553
2554         error = 0;
2555
2556         sc = calloc(1, sizeof(struct pci_nvme_softc));
2557         pi->pi_arg = sc;
2558         sc->nsc_pi = pi;
2559
2560         error = pci_nvme_parse_opts(sc, opts);
2561         if (error < 0)
2562                 goto done;
2563         else
2564                 error = 0;
2565
2566         STAILQ_INIT(&sc->ioreqs_free);
2567         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2568         for (int i = 0; i < sc->ioslots; i++) {
2569                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2570         }
2571
2572         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2573         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2574         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2575         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2576         pci_set_cfgdata8(pi, PCIR_PROGIF,
2577                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2578
2579         /*
2580          * Allocate size of NVMe registers + doorbell space for all queues.
2581          *
2582          * The specification requires a minimum memory I/O window size of 16K.
2583          * The Windows driver will refuse to start a device with a smaller
2584          * window.
2585          */
2586         pci_membar_sz = sizeof(struct nvme_registers) +
2587             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2588         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
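
         /*
          * Worked example (illustrative): with, say, 16 queue pairs the
          * doorbell area needs 2 * 4 * 17 = 136 bytes on top of the
          * register file, far below 16 KiB, so the MAX() above rounds the
          * BAR up to NVME_MMIO_SPACE_MIN.
          */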
2589
2590         DPRINTF("nvme membar size: %u", pci_membar_sz);
2591
2592         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2593         if (error) {
2594                 WPRINTF("%s pci alloc mem bar failed", __func__);
2595                 goto done;
2596         }
2597
2598         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2599         if (error) {
2600                 WPRINTF("%s pci add msixcap failed", __func__);
2601                 goto done;
2602         }
2603
2604         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2605         if (error) {
2606                 WPRINTF("%s pci add Express capability failed", __func__);
2607                 goto done;
2608         }
2609
2610         pthread_mutex_init(&sc->mtx, NULL);
2611         sem_init(&sc->iosemlock, 0, sc->ioslots);
2612
2613         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2614         /*
2615          * Controller data depends on Namespace data so initialize Namespace
2616          * data first.
2617          */
2618         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2619         pci_nvme_init_ctrldata(sc);
2620         pci_nvme_init_logpages(sc);
2621         pci_nvme_init_features(sc);
2622
2623         pci_nvme_reset(sc);
2624
2625         pci_lintr_request(pi);
2626
2627 done:
2628         return (error);
2629 }
2630
2631
2632 struct pci_devemu pci_de_nvme = {
2633         .pe_emu =       "nvme",
2634         .pe_init =      pci_nvme_init,
2635         .pe_barwrite =  pci_nvme_write,
2636         .pe_barread =   pci_nvme_read
2637 };
2638 PCI_EMUL_SET(pci_de_nvme);