1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable, disable
51  *
52  */
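/*
 * Example (slot number and values are illustrative, not defaults): a 1 GiB
 * RAM-backed namespace with four I/O queue pairs might be configured as:
 *
 *   -s 4,nvme,ram=1024,maxq=4,qsz=256,ioslots=16,sectsz=512,ser=NVME0001
 */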
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN             0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
106
107 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
108 #define NVME_MDTS               9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
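/*
 * With the defaults above (MDTS = 9, MPSMIN = 4 KiB pages), the largest
 * transfer is 2^9 * 4 KiB = 2 MiB, which needs at most 512 page descriptors
 * plus one extra for an unaligned initial descriptor (NVME_MAX_IOVEC = 513).
 */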
112
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS          0xffff
115 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
116
117 /* helpers */
118
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)         ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)         ((one)  - 1)
123
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127          (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
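/*
 * For example, with num_squeues = 4 and num_cqueues = 4 the encoded value is
 * 0x00030003: both counts are reported zero-based, submission queues in the
 * low 16 bits and completion queues in the high 16 bits.
 */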
128
129 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
130
131 enum nvme_controller_register_offsets {
132         NVME_CR_CAP_LOW = 0x00,
133         NVME_CR_CAP_HI  = 0x04,
134         NVME_CR_VS      = 0x08,
135         NVME_CR_INTMS   = 0x0c,
136         NVME_CR_INTMC   = 0x10,
137         NVME_CR_CC      = 0x14,
138         NVME_CR_CSTS    = 0x1c,
139         NVME_CR_NSSR    = 0x20,
140         NVME_CR_AQA     = 0x24,
141         NVME_CR_ASQ_LOW = 0x28,
142         NVME_CR_ASQ_HI  = 0x2c,
143         NVME_CR_ACQ_LOW = 0x30,
144         NVME_CR_ACQ_HI  = 0x34,
145 };
146
147 enum nvme_cmd_cdw11 {
148         NVME_CMD_CDW11_PC  = 0x0001,
149         NVME_CMD_CDW11_IEN = 0x0002,
150         NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152
153 enum nvme_copy_dir {
154         NVME_COPY_TO_PRP,
155         NVME_COPY_FROM_PRP,
156 };
157
158 #define NVME_CQ_INTEN   0x01
159 #define NVME_CQ_INTCOAL 0x02
160
161 struct nvme_completion_queue {
162         struct nvme_completion *qbase;
163         pthread_mutex_t mtx;
164         uint32_t        size;
165         uint16_t        tail; /* nvme progress */
166         uint16_t        head; /* guest progress */
167         uint16_t        intr_vec;
168         uint32_t        intr_en;
169 };
170
171 struct nvme_submission_queue {
172         struct nvme_command *qbase;
173         pthread_mutex_t mtx;
174         uint32_t        size;
175         uint16_t        head; /* nvme progress */
176         uint16_t        tail; /* guest progress */
177         uint16_t        cqid; /* completion queue id */
178         int             qpriority;
179 };
180
181 enum nvme_storage_type {
182         NVME_STOR_BLOCKIF = 0,
183         NVME_STOR_RAM = 1,
184 };
185
186 struct pci_nvme_blockstore {
187         enum nvme_storage_type type;
188         void            *ctx;
189         uint64_t        size;
190         uint32_t        sectsz;
191         uint32_t        sectsz_bits;
192         uint64_t        eui64;
193         uint32_t        deallocate:1;
194 };
195
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
203 #define MDTS_PAD_SIZE \
204         NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205         NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206         0
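/*
 * For instance, assuming BLOCKIF_IOV_MAX is 128 (see block_if.h), each
 * pci_nvme_ioreq carries 513 - 128 = 385 padding iovec entries so that a
 * maximal transfer never overruns the iovec array embedded in blockif_req.
 */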
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 struct pci_nvme_aer {
255         STAILQ_ENTRY(pci_nvme_aer) link;
256         uint16_t        cid;    /* Command ID of the submitted AER */
257 };
258
259 struct pci_nvme_softc {
260         struct pci_devinst *nsc_pi;
261
262         pthread_mutex_t mtx;
263
264         struct nvme_registers regs;
265
266         struct nvme_namespace_data  nsdata;
267         struct nvme_controller_data ctrldata;
268         struct nvme_error_information_entry err_log;
269         struct nvme_health_information_page health_log;
270         struct nvme_firmware_page fw_log;
271
272         struct pci_nvme_blockstore nvstore;
273
274         uint16_t        max_qentries;   /* max entries per queue */
275         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
276         uint32_t        num_cqueues;
277         uint32_t        num_squeues;
278         bool            num_q_is_set; /* Has host set Number of Queues */
279
280         struct pci_nvme_ioreq *ioreqs;
281         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
282         uint32_t        pending_ios;
283         uint32_t        ioslots;
284         sem_t           iosemlock;
285
286         /*
287          * Memory mapped Submission and Completion queues
288          * Each array includes both Admin and IO queues
289          */
290         struct nvme_completion_queue *compl_queues;
291         struct nvme_submission_queue *submit_queues;
292
293         struct nvme_feature_obj feat[NVME_FID_MAX];
294
295         enum nvme_dsm_type dataset_management;
296
297         /* Accounting for SMART data */
298         __uint128_t     read_data_units;
299         __uint128_t     write_data_units;
300         __uint128_t     read_commands;
301         __uint128_t     write_commands;
302         uint32_t        read_dunits_remainder;
303         uint32_t        write_dunits_remainder;
304
305         STAILQ_HEAD(, pci_nvme_aer) aer_list;
306         uint32_t        aer_count;
307 };
308
309
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
313
314 /* Controller Configuration utils */
315 #define NVME_CC_GET_EN(cc) \
316         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
317 #define NVME_CC_GET_CSS(cc) \
318         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
319 #define NVME_CC_GET_SHN(cc) \
320         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
321 #define NVME_CC_GET_IOSQES(cc) \
322         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
323 #define NVME_CC_GET_IOCQES(cc) \
324         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
325
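/*
 * CC fields the host may modify at any time (EN, IOSQES, IOCQES) versus
 * fields presumably writable only while the controller is Not ENabled
 * (CSS, MPS, AMS), per the two masks below.
 */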
326 #define NVME_CC_WRITE_MASK \
327         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
328          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
329          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
330
331 #define NVME_CC_NEN_WRITE_MASK \
332         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
333          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
334          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
335
336 /* Controller Status utils */
337 #define NVME_CSTS_GET_RDY(sts) \
338         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
339
340 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
341
342 /* Completion Queue status word utils */
343 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
344 #define NVME_STATUS_MASK \
345         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
346          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
347
348 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
349         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
350
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352     struct nvme_feature_obj *,
353     struct nvme_command *,
354     struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359
360 static __inline void
361 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
362 {
363         size_t len;
364
365         len = strnlen(src, dst_size);
366         memset(dst, pad, dst_size);
367         memcpy(dst, src, len);
368 }
369
370 static __inline void
371 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
372 {
373
374         *status &= ~NVME_STATUS_MASK;
375         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
376                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
377 }
378
379 static __inline void
380 pci_nvme_status_genc(uint16_t *status, uint16_t code)
381 {
382
383         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
384 }
385
386 /*
387  * Initialize the requested number of IO Submission and Completion Queues.
388  * Admin queues are allocated implicitly.
389  */
390 static void
391 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
392 {
393         uint32_t i;
394
395         /*
396          * Allocate and initialize the Submission Queues
397          */
398         if (nsq > NVME_QUEUES) {
399                 WPRINTF("%s: clamping number of SQ from %u to %u",
400                                         __func__, nsq, NVME_QUEUES);
401                 nsq = NVME_QUEUES;
402         }
403
404         sc->num_squeues = nsq;
405
406         sc->submit_queues = calloc(sc->num_squeues + 1,
407                                 sizeof(struct nvme_submission_queue));
408         if (sc->submit_queues == NULL) {
409                 WPRINTF("%s: SQ allocation failed", __func__);
410                 sc->num_squeues = 0;
411         } else {
412                 struct nvme_submission_queue *sq = sc->submit_queues;
413
414                 for (i = 0; i < sc->num_squeues; i++)
415                         pthread_mutex_init(&sq[i].mtx, NULL);
416         }
417
418         /*
419          * Allocate and initialize the Completion Queues
420          */
421         if (ncq > NVME_QUEUES) {
422                 WPRINTF("%s: clamping number of CQ from %u to %u",
423                                         __func__, ncq, NVME_QUEUES);
424                 ncq = NVME_QUEUES;
425         }
426
427         sc->num_cqueues = ncq;
428
429         sc->compl_queues = calloc(sc->num_cqueues + 1,
430                                 sizeof(struct nvme_completion_queue));
431         if (sc->compl_queues == NULL) {
432                 WPRINTF("%s: CQ allocation failed", __func__);
433                 sc->num_cqueues = 0;
434         } else {
435                 struct nvme_completion_queue *cq = sc->compl_queues;
436
437                 for (i = 0; i < sc->num_cqueues; i++)
438                         pthread_mutex_init(&cq[i].mtx, NULL);
439         }
440 }
441
442 static void
443 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
444 {
445         struct nvme_controller_data *cd = &sc->ctrldata;
446
447         cd->vid = 0xFB5D;
448         cd->ssvid = 0x0000;
449
450         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
451         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
452
453         /* Num of submission commands that we can handle at a time (2^rab) */
454         cd->rab   = 4;
455
456         /* FreeBSD OUI */
457         cd->ieee[0] = 0x58;
458         cd->ieee[1] = 0x9c;
459         cd->ieee[2] = 0xfc;
460
461         cd->mic = 0;
462
463         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
464
465         cd->ver = 0x00010300;
466
467         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
468         cd->acl = 2;
469         cd->aerl = 4;
470
471         /* Advertise 1, Read-only firmware slot */
472         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
473             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
474         cd->lpa = 0;    /* TODO: support some simple things like SMART */
475         cd->elpe = 0;   /* max error log page entries */
476         cd->npss = 1;   /* number of power states supported */
477
478         /* Warning Composite Temperature Threshold */
479         cd->wctemp = 0x0157;
480
481         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
482             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
483         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
484             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
485         cd->nn = 1;     /* number of namespaces */
486
487         cd->oncs = 0;
488         switch (sc->dataset_management) {
489         case NVME_DATASET_MANAGEMENT_AUTO:
490                 if (sc->nvstore.deallocate)
491                         cd->oncs |= NVME_ONCS_DSM;
492                 break;
493         case NVME_DATASET_MANAGEMENT_ENABLE:
494                 cd->oncs |= NVME_ONCS_DSM;
495                 break;
496         default:
497                 break;
498         }
499
500         cd->fna = 0x03;
501
502         cd->power_state[0].mp = 10;
503 }
504
505 /*
506  * Calculate the CRC-16 of the given buffer
507  * See copyright attribution at top of file
508  */
509 static uint16_t
510 crc16(uint16_t crc, const void *buffer, unsigned int len)
511 {
512         const unsigned char *cp = buffer;
513         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
514         static uint16_t const crc16_table[256] = {
515                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
516                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
517                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
518                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
519                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
520                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
521                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
522                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
523                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
524                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
525                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
526                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
527                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
528                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
529                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
530                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
531                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
532                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
533                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
534                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
535                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
536                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
537                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
538                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
539                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
540                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
541                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
542                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
543                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
544                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
545                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
546                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
547         };
548
549         while (len--)
550                 crc = (((crc >> 8) & 0xffU) ^
551                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
552         return crc;
553 }
554
555 static void
556 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
557     struct nvme_namespace_data *nd, uint32_t nsid,
558     struct pci_nvme_blockstore *nvstore)
559 {
560
561         /* Get capacity and block size information from backing store */
562         nd->nsze = nvstore->size / nvstore->sectsz;
563         nd->ncap = nd->nsze;
564         nd->nuse = nd->nsze;
565
566         if (nvstore->type == NVME_STOR_BLOCKIF)
567                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
568
569         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
570         nd->flbas = 0;
571
572         /* Create an EUI-64 if user did not provide one */
573         if (nvstore->eui64 == 0) {
574                 char *data = NULL;
575                 uint64_t eui64 = nvstore->eui64;
576
577                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
578                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
579
580                 if (data != NULL) {
581                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
582                         free(data);
583                 }
584                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
585         }
586         be64enc(nd->eui64, nvstore->eui64);
587
588         /* LBA data-sz = 2^lbads */
589         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
590 }
591
592 static void
593 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
594 {
595
596         memset(&sc->err_log, 0, sizeof(sc->err_log));
597         memset(&sc->health_log, 0, sizeof(sc->health_log));
598         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
599
600         /* Set read/write remainder to round up according to spec */
601         sc->read_dunits_remainder = 999;
602         sc->write_dunits_remainder = 999;
603 }
604
605 static void
606 pci_nvme_init_features(struct pci_nvme_softc *sc)
607 {
608
609         sc->feat[0].set = nvme_feature_invalid_cb;
610         sc->feat[0].get = nvme_feature_invalid_cb;
611
612         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
613         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
614         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
615 }
616
617 static void
618 pci_nvme_aer_init(struct pci_nvme_softc *sc)
619 {
620
621         STAILQ_INIT(&sc->aer_list);
622         sc->aer_count = 0;
623 }
624
625 static void
626 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
627 {
628         struct pci_nvme_aer *aer = NULL;
629
630         while (!STAILQ_EMPTY(&sc->aer_list)) {
631                 aer = STAILQ_FIRST(&sc->aer_list);
632                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
633                 free(aer);
634         }
635
636         pci_nvme_aer_init(sc);
637 }
638
639 static bool
640 pci_nvme_aer_available(struct pci_nvme_softc *sc)
641 {
642
643         return (!STAILQ_EMPTY(&sc->aer_list));
644 }
645
646 static bool
647 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
648 {
649         struct nvme_controller_data *cd = &sc->ctrldata;
650
651         /* AERL is a zero-based value while aer_count is one-based */
652         return (sc->aer_count == (cd->aerl + 1));
653 }
654
655 /*
656  * Add an Async Event Request
657  *
658  * Stores an AER to be returned later if the Controller needs to notify the
659  * host of an event.
660  * Note that while the NVMe spec doesn't require Controllers to return AER's
661  * in order, this implementation does preserve the order.
662  */
663 static int
664 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
665 {
666         struct pci_nvme_aer *aer = NULL;
667
668         if (pci_nvme_aer_limit_reached(sc))
669                 return (-1);
670
671         aer = calloc(1, sizeof(struct pci_nvme_aer));
672         if (aer == NULL)
673                 return (-1);
674
675         sc->aer_count++;
676
677         /* Save the Command ID for use in the completion message */
678         aer->cid = cid;
679         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
680
681         return (0);
682 }
683
684 /*
685  * Get an Async Event Request structure
686  *
687  * Returns a pointer to an AER previously submitted by the host or NULL if
688  * no AER's exist. Caller is responsible for freeing the returned struct.
689  */
690 static struct pci_nvme_aer *
691 pci_nvme_aer_get(struct pci_nvme_softc *sc)
692 {
693         struct pci_nvme_aer *aer = NULL;
694
695         aer = STAILQ_FIRST(&sc->aer_list);
696         if (aer != NULL) {
697                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
698                 sc->aer_count--;
699         }
700         
701         return (aer);
702 }
703
704 static void
705 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
706 {
707         uint32_t i;
708
709         DPRINTF("%s", __func__);
710
711         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
712             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
713             (60 << NVME_CAP_LO_REG_TO_SHIFT);
714
715         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
716
717         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
718
719         sc->regs.cc = 0;
720         sc->regs.csts = 0;
721
722         assert(sc->submit_queues != NULL);
723
724         for (i = 0; i < sc->num_squeues + 1; i++) {
725                 sc->submit_queues[i].qbase = NULL;
726                 sc->submit_queues[i].size = 0;
727                 sc->submit_queues[i].cqid = 0;
728                 sc->submit_queues[i].tail = 0;
729                 sc->submit_queues[i].head = 0;
730         }
731
732         assert(sc->compl_queues != NULL);
733
734         for (i = 0; i < sc->num_cqueues + 1; i++) {
735                 sc->compl_queues[i].qbase = NULL;
736                 sc->compl_queues[i].size = 0;
737                 sc->compl_queues[i].tail = 0;
738                 sc->compl_queues[i].head = 0;
739         }
740
741         sc->num_q_is_set = false;
742
743         pci_nvme_aer_destroy(sc);
744 }
745
746 static void
747 pci_nvme_reset(struct pci_nvme_softc *sc)
748 {
749         pthread_mutex_lock(&sc->mtx);
750         pci_nvme_reset_locked(sc);
751         pthread_mutex_unlock(&sc->mtx);
752 }
753
754 static void
755 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
756 {
757         uint16_t acqs, asqs;
758
759         DPRINTF("%s", __func__);
760
761         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
762         sc->submit_queues[0].size = asqs;
763         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
764                     sizeof(struct nvme_command) * asqs);
765
766         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
767                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
768
769         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
770             NVME_AQA_REG_ACQS_MASK) + 1;
771         sc->compl_queues[0].size = acqs;
772         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
773                  sizeof(struct nvme_completion) * acqs);
774         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
775
776         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
777                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
778 }
779
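/*
 * Copy len bytes between a host buffer and guest memory described by a
 * PRP1/PRP2 pair. Transfers are limited to two pages (8 KiB); PRP2 is used
 * as a second page pointer, not as a pointer to a PRP list.
 */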
780 static int
781 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
782         size_t len, enum nvme_copy_dir dir)
783 {
784         uint8_t *p;
785         size_t bytes;
786
787         if (len > (8 * 1024)) {
788                 return (-1);
789         }
790
791         /* Copy from the start of prp1 to the end of the physical page */
792         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
793         bytes = MIN(bytes, len);
794
795         p = vm_map_gpa(ctx, prp1, bytes);
796         if (p == NULL) {
797                 return (-1);
798         }
799
800         if (dir == NVME_COPY_TO_PRP)
801                 memcpy(p, b, bytes);
802         else
803                 memcpy(b, p, bytes);
804
805         b += bytes;
806
807         len -= bytes;
808         if (len == 0) {
809                 return (0);
810         }
811
812         len = MIN(len, PAGE_SIZE);
813
814         p = vm_map_gpa(ctx, prp2, len);
815         if (p == NULL) {
816                 return (-1);
817         }
818
819         if (dir == NVME_COPY_TO_PRP)
820                 memcpy(p, b, len);
821         else
822                 memcpy(b, p, len);
823
824         return (0);
825 }
826
827 /*
828  * Write a Completion Queue Entry update
829  *
830  * Write the completion and update the doorbell value
831  */
832 static void
833 pci_nvme_cq_update(struct pci_nvme_softc *sc,
834                 struct nvme_completion_queue *cq,
835                 uint32_t cdw0,
836                 uint16_t cid,
837                 uint16_t sqid,
838                 uint16_t status)
839 {
840         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
841         struct nvme_completion *cqe;
842
843         assert(cq->qbase != NULL);
844
845         pthread_mutex_lock(&cq->mtx);
846
847         cqe = &cq->qbase[cq->tail];
848
849         /* Flip the phase bit */
850         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
851
852         cqe->cdw0 = cdw0;
853         cqe->sqhd = sq->head;
854         cqe->sqid = sqid;
855         cqe->cid = cid;
856         cqe->status = status;
857
858         cq->tail++;
859         if (cq->tail >= cq->size) {
860                 cq->tail = 0;
861         }
862
863         pthread_mutex_unlock(&cq->mtx);
864 }
865
866 static int
867 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
868         struct nvme_completion* compl)
869 {
870         uint16_t qid = command->cdw10 & 0xffff;
871
872         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
873         if (qid == 0 || qid > sc->num_squeues ||
874             (sc->submit_queues[qid].qbase == NULL)) {
875                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
876                         __func__, qid, sc->num_squeues);
877                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
878                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
879                 return (1);
880         }
881
882         sc->submit_queues[qid].qbase = NULL;
883         sc->submit_queues[qid].cqid = 0;
884         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
885         return (1);
886 }
887
888 static int
889 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
890         struct nvme_completion* compl)
891 {
892         if (command->cdw11 & NVME_CMD_CDW11_PC) {
893                 uint16_t qid = command->cdw10 & 0xffff;
894                 struct nvme_submission_queue *nsq;
895
896                 if ((qid == 0) || (qid > sc->num_squeues) ||
897                     (sc->submit_queues[qid].qbase != NULL)) {
898                         WPRINTF("%s queue index %u > num_squeues %u",
899                                 __func__, qid, sc->num_squeues);
900                         pci_nvme_status_tc(&compl->status,
901                             NVME_SCT_COMMAND_SPECIFIC,
902                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
903                         return (1);
904                 }
905
906                 nsq = &sc->submit_queues[qid];
907                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
908                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
909                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
910                         /*
911                          * Queues must specify at least two entries
912                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
913                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
914                          */
915                         pci_nvme_status_tc(&compl->status,
916                             NVME_SCT_COMMAND_SPECIFIC,
917                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
918                         return (1);
919                 }
920
921                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
922                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
923                         pci_nvme_status_tc(&compl->status,
924                             NVME_SCT_COMMAND_SPECIFIC,
925                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
926                         return (1);
927                 }
928
929                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
930                         pci_nvme_status_tc(&compl->status,
931                             NVME_SCT_COMMAND_SPECIFIC,
932                             NVME_SC_COMPLETION_QUEUE_INVALID);
933                         return (1);
934                 }
935
936                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
937
938                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
939                               sizeof(struct nvme_command) * (size_t)nsq->size);
940
941                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
942                         qid, nsq->size, nsq->qbase, nsq->cqid);
943
944                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
945
946                 DPRINTF("%s completed creating IOSQ qid %u",
947                          __func__, qid);
948         } else {
949                 /* 
950                  * Guest sent a non-contiguous submission queue request.
951                  * This setting is unsupported by this emulation.
952                  */
953                 WPRINTF("%s unsupported non-contig (list-based) "
954                          "create i/o submission queue", __func__);
955
956                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
957         }
958         return (1);
959 }
960
961 static int
962 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
963         struct nvme_completion* compl)
964 {
965         uint16_t qid = command->cdw10 & 0xffff;
966         uint16_t sqid;
967
968         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
969         if (qid == 0 || qid > sc->num_cqueues ||
970             (sc->compl_queues[qid].qbase == NULL)) {
971                 WPRINTF("%s queue index %u / num_cqueues %u",
972                         __func__, qid, sc->num_cqueues);
973                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
974                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
975                 return (1);
976         }
977
978         /* Deleting an Active CQ is an error */
979         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
980                 if (sc->submit_queues[sqid].cqid == qid) {
981                         pci_nvme_status_tc(&compl->status,
982                             NVME_SCT_COMMAND_SPECIFIC,
983                             NVME_SC_INVALID_QUEUE_DELETION);
984                         return (1);
985                 }
986
987         sc->compl_queues[qid].qbase = NULL;
988         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
989         return (1);
990 }
991
992 static int
993 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
994         struct nvme_completion* compl)
995 {
996         struct nvme_completion_queue *ncq;
997         uint16_t qid = command->cdw10 & 0xffff;
998
999         /* Only support Physically Contiguous queues */
1000         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1001                 WPRINTF("%s unsupported non-contig (list-based) "
1002                          "create i/o completion queue",
1003                          __func__);
1004
1005                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1006                 return (1);
1007         }
1008
1009         if ((qid == 0) || (qid > sc->num_cqueues) ||
1010             (sc->compl_queues[qid].qbase != NULL)) {
1011                 WPRINTF("%s queue index %u > num_cqueues %u",
1012                         __func__, qid, sc->num_cqueues);
1013                 pci_nvme_status_tc(&compl->status,
1014                     NVME_SCT_COMMAND_SPECIFIC,
1015                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1016                 return (1);
1017         }
1018
1019         ncq = &sc->compl_queues[qid];
1020         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1021         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1022         if (ncq->intr_vec > (sc->max_queues + 1)) {
1023                 pci_nvme_status_tc(&compl->status,
1024                     NVME_SCT_COMMAND_SPECIFIC,
1025                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1026                 return (1);
1027         }
1028
1029         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1030         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1031                 /*
1032                  * Queues must specify at least two entries
1033                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1034                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1035                  */
1036                 pci_nvme_status_tc(&compl->status,
1037                     NVME_SCT_COMMAND_SPECIFIC,
1038                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1039                 return (1);
1040         }
1041         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1042                      command->prp1,
1043                      sizeof(struct nvme_completion) * (size_t)ncq->size);
1044
1045         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1046
1047
1048         return (1);
1049 }
1050
1051 static int
1052 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1053         struct nvme_completion* compl)
1054 {
1055         uint32_t logsize;
1056         uint8_t logpage = command->cdw10 & 0xFF;
1057
1058         /*
1059          * Command specifies the number of dwords to return in fields NUMDU
1060          * and NUMDL. This is a zero-based value.
1061          */
1062         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1063         logsize *= sizeof(uint32_t);
1064
1065         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1066
1067         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1068
1069         switch (logpage) {
1070         case NVME_LOG_ERROR:
1071                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1072                     command->prp2, (uint8_t *)&sc->err_log,
1073                     MIN(logsize, sizeof(sc->err_log)),
1074                     NVME_COPY_TO_PRP);
1075                 break;
1076         case NVME_LOG_HEALTH_INFORMATION:
1077                 pthread_mutex_lock(&sc->mtx);
1078                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1079                     sizeof(sc->health_log.data_units_read));
1080                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1081                     sizeof(sc->health_log.data_units_written));
1082                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1083                     sizeof(sc->health_log.host_read_commands));
1084                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1085                     sizeof(sc->health_log.host_write_commands));
1086                 pthread_mutex_unlock(&sc->mtx);
1087
1088                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1089                     command->prp2, (uint8_t *)&sc->health_log,
1090                     MIN(logsize, sizeof(sc->health_log)),
1091                     NVME_COPY_TO_PRP);
1092                 break;
1093         case NVME_LOG_FIRMWARE_SLOT:
1094                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1095                     command->prp2, (uint8_t *)&sc->fw_log,
1096                     MIN(logsize, sizeof(sc->fw_log)),
1097                     NVME_COPY_TO_PRP);
1098                 break;
1099         default:
1100                 DPRINTF("%s get log page %x command not supported",
1101                         __func__, logpage);
1102
1103                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1104                     NVME_SC_INVALID_LOG_PAGE);
1105         }
1106
1107         return (1);
1108 }
1109
1110 static int
1111 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1112         struct nvme_completion* compl)
1113 {
1114         void *dest;
1115         uint16_t status;
1116
1117         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1118                 command->cdw10 & 0xFF, command->nsid);
1119
1120         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1121
1122         switch (command->cdw10 & 0xFF) {
1123         case 0x00: /* return Identify Namespace data structure */
1124                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1125                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1126                     NVME_COPY_TO_PRP);
1127                 break;
1128         case 0x01: /* return Identify Controller data structure */
1129                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1130                     command->prp2, (uint8_t *)&sc->ctrldata,
1131                     sizeof(sc->ctrldata),
1132                     NVME_COPY_TO_PRP);
1133                 break;
1134         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1135                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1136                                   sizeof(uint32_t) * 1024);
1137                 ((uint32_t *)dest)[0] = 1;
1138                 ((uint32_t *)dest)[1] = 0;
1139                 break;
1140         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1141                 if (command->nsid != 1) {
1142                         pci_nvme_status_genc(&status,
1143                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1144                         break;
1145                 }
1146                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1147                                   sizeof(uint32_t) * 1024);
1148                 /* All bytes after the descriptor shall be zero */
1149                 bzero(dest, sizeof(uint32_t) * 1024);
1150
1151                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1152                 ((uint8_t *)dest)[0] = 1;
1153                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1154                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1155                 break;
1156         default:
1157                 DPRINTF("%s unsupported identify command requested 0x%x",
1158                          __func__, command->cdw10 & 0xFF);
1159                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1160                 break;
1161         }
1162
1163         compl->status = status;
1164         return (1);
1165 }
1166
1167 static const char *
1168 nvme_fid_to_name(uint8_t fid)
1169 {
1170         const char *name;
1171
1172         switch (fid) {
1173         case NVME_FEAT_ARBITRATION:
1174                 name = "Arbitration";
1175                 break;
1176         case NVME_FEAT_POWER_MANAGEMENT:
1177                 name = "Power Management";
1178                 break;
1179         case NVME_FEAT_LBA_RANGE_TYPE:
1180                 name = "LBA Range Type";
1181                 break;
1182         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1183                 name = "Temperature Threshold";
1184                 break;
1185         case NVME_FEAT_ERROR_RECOVERY:
1186                 name = "Error Recovery";
1187                 break;
1188         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1189                 name = "Volatile Write Cache";
1190                 break;
1191         case NVME_FEAT_NUMBER_OF_QUEUES:
1192                 name = "Number of Queues";
1193                 break;
1194         case NVME_FEAT_INTERRUPT_COALESCING:
1195                 name = "Interrupt Coalescing";
1196                 break;
1197         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1198                 name = "Interrupt Vector Configuration";
1199                 break;
1200         case NVME_FEAT_WRITE_ATOMICITY:
1201                 name = "Write Atomicity Normal";
1202                 break;
1203         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1204                 name = "Asynchronous Event Configuration";
1205                 break;
1206         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1207                 name = "Autonomous Power State Transition";
1208                 break;
1209         case NVME_FEAT_HOST_MEMORY_BUFFER:
1210                 name = "Host Memory Buffer";
1211                 break;
1212         case NVME_FEAT_TIMESTAMP:
1213                 name = "Timestamp";
1214                 break;
1215         case NVME_FEAT_KEEP_ALIVE_TIMER:
1216                 name = "Keep Alive Timer";
1217                 break;
1218         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1219                 name = "Host Controlled Thermal Management";
1220                 break;
1221         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1222                 name = "Non-Operation Power State Config";
1223                 break;
1224         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1225                 name = "Read Recovery Level Config";
1226                 break;
1227         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1228                 name = "Predictable Latency Mode Config";
1229                 break;
1230         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1231                 name = "Predictable Latency Mode Window";
1232                 break;
1233         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1234                 name = "LBA Status Information Report Interval";
1235                 break;
1236         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1237                 name = "Host Behavior Support";
1238                 break;
1239         case NVME_FEAT_SANITIZE_CONFIG:
1240                 name = "Sanitize Config";
1241                 break;
1242         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1243                 name = "Endurance Group Event Configuration";
1244                 break;
1245         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1246                 name = "Software Progress Marker";
1247                 break;
1248         case NVME_FEAT_HOST_IDENTIFIER:
1249                 name = "Host Identifier";
1250                 break;
1251         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1252                 name = "Reservation Notification Mask";
1253                 break;
1254         case NVME_FEAT_RESERVATION_PERSISTENCE:
1255                 name = "Reservation Persistence";
1256                 break;
1257         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1258                 name = "Namespace Write Protection Config";
1259                 break;
1260         default:
1261                 name = "Unknown";
1262                 break;
1263         }
1264
1265         return (name);
1266 }
1267
1268 static void
1269 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1270     struct nvme_feature_obj *feat,
1271     struct nvme_command *command,
1272     struct nvme_completion *compl)
1273 {
1274
1275         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1276 }
1277
1278 static void
1279 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1280     struct nvme_feature_obj *feat,
1281     struct nvme_command *command,
1282     struct nvme_completion *compl)
1283 {
1284         uint16_t nqr;   /* Number of Queues Requested */
1285
1286         if (sc->num_q_is_set) {
1287                 WPRINTF("%s: Number of Queues already set", __func__);
1288                 pci_nvme_status_genc(&compl->status,
1289                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1290                 return;
1291         }
1292
1293         nqr = command->cdw11 & 0xFFFF;
1294         if (nqr == 0xffff) {
1295                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1296                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1297                 return;
1298         }
1299
1300         sc->num_squeues = ONE_BASED(nqr);
1301         if (sc->num_squeues > sc->max_queues) {
1302                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1303                                         sc->max_queues);
1304                 sc->num_squeues = sc->max_queues;
1305         }
1306
1307         nqr = (command->cdw11 >> 16) & 0xFFFF;
1308         if (nqr == 0xffff) {
1309                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1310                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1311                 return;
1312         }
1313
1314         sc->num_cqueues = ONE_BASED(nqr);
1315         if (sc->num_cqueues > sc->max_queues) {
1316                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1317                                         sc->max_queues);
1318                 sc->num_cqueues = sc->max_queues;
1319         }
1320
1321         /* Patch the command value which will be saved on callback's return */
1322         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1323         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1324
1325         sc->num_q_is_set = true;
1326 }
1327
1328 static int
1329 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1330         struct nvme_completion *compl)
1331 {
1332         struct nvme_feature_obj *feat;
1333         uint32_t nsid = command->nsid;
1334         uint8_t fid = command->cdw10 & 0xFF;
1335
1336         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1337
1338         if (fid >= NVME_FID_MAX) {
1339                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1340                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1341                 return (1);
1342         }
1343         feat = &sc->feat[fid];
1344
1345         if (!feat->namespace_specific &&
1346             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1347                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1348                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1349                 return (1);
1350         }
1351
1352         compl->cdw0 = 0;
1353         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1354
1355         if (feat->set)
1356                 feat->set(sc, feat, command, compl);
1357
1358         if (compl->status == NVME_SC_SUCCESS)
1359                 feat->cdw11 = command->cdw11;
1360
1361         return (0);
1362 }
1363
1364 static int
1365 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1366         struct nvme_completion* compl)
1367 {
1368         struct nvme_feature_obj *feat;
1369         uint8_t fid = command->cdw10 & 0xFF;
1370
1371         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1372
1373         if (fid >= NVME_FID_MAX) {
1374                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1375                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1376                 return (1);
1377         }
1378
1379         compl->cdw0 = 0;
1380         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1381
1382         feat = &sc->feat[fid];
1383         if (feat->get) {
1384                 feat->get(sc, feat, command, compl);
1385         }
1386
1387         if (compl->status == NVME_SC_SUCCESS) {
1388                 compl->cdw0 = feat->cdw11;
1389         }
1390
1391         return (0);
1392 }
1393
1394 static int
1395 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1396         struct nvme_completion* compl)
1397 {
1398         uint8_t ses, lbaf, pi;
1399
1400         /* Only supports Secure Erase Setting - User Data Erase */
1401         ses = (command->cdw10 >> 9) & 0x7;
1402         if (ses > 0x1) {
1403                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1404                 return (1);
1405         }
1406
1407         /* Only supports a single LBA Format */
1408         lbaf = command->cdw10 & 0xf;
1409         if (lbaf != 0) {
1410                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1411                     NVME_SC_INVALID_FORMAT);
1412                 return (1);
1413         }
1414
1415         /* Doesn't support Protection Information */
1416         pi = (command->cdw10 >> 5) & 0x7;
1417         if (pi != 0) {
1418                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1419                 return (1);
1420         }
1421
1422         if (sc->nvstore.type == NVME_STOR_RAM) {
1423                 if (sc->nvstore.ctx)
1424                         free(sc->nvstore.ctx);
1425                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1426                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1427         } else {
1428                 struct pci_nvme_ioreq *req;
1429                 int err;
1430
1431                 req = pci_nvme_get_ioreq(sc);
1432                 if (req == NULL) {
1433                         pci_nvme_status_genc(&compl->status,
1434                             NVME_SC_INTERNAL_DEVICE_ERROR);
1435                         WPRINTF("%s: unable to allocate IO req", __func__);
1436                         return (1);
1437                 }
1438                 req->nvme_sq = &sc->submit_queues[0];
1439                 req->sqid = 0;
1440                 req->opc = command->opc;
1441                 req->cid = command->cid;
1442                 req->nsid = command->nsid;
1443
1444                 req->io_req.br_offset = 0;
1445                 req->io_req.br_resid = sc->nvstore.size;
1446                 req->io_req.br_callback = pci_nvme_io_done;
1447
1448                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1449                 if (err) {
1450                         pci_nvme_status_genc(&compl->status,
1451                             NVME_SC_INTERNAL_DEVICE_ERROR);
1452                         pci_nvme_release_ioreq(sc, req);
1453                 }
1454         }
1455
1456         return (1);
1457 }
1458
1459 static int
1460 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1461         struct nvme_completion* compl)
1462 {
1463         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1464                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1465
1466         /* TODO: search for the command ID and abort it */
1467
1468         compl->cdw0 = 1;
1469         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1470         return (1);
1471 }
1472
1473 static int
1474 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1475         struct nvme_command* command, struct nvme_completion* compl)
1476 {
1477         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1478
1479         /* Don't exceed the Async Event Request Limit (AERL). */
1480         if (pci_nvme_aer_limit_reached(sc)) {
1481                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1482                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1483                 return (1);
1484         }
1485
1486         if (pci_nvme_aer_add(sc, command->cid)) {
1487                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1488                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1489                 return (1);
1490         }
1491
1492         /*
1493          * Events are raised asynchronously as they occur (subject to the
1494          * Set Features event configuration), so do not post a completion
1495          * now; the request completes later when a matching event occurs.
1496          */
1497         compl->status = NVME_NO_STATUS;
1498
1499         return (0);
1500 }
1501
1502 static void
1503 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1504 {
1505         struct nvme_completion compl;
1506         struct nvme_command *cmd;
1507         struct nvme_submission_queue *sq;
1508         struct nvme_completion_queue *cq;
1509         uint16_t sqhead;
1510
1511         DPRINTF("%s index %u", __func__, (uint32_t)value);
1512
1513         sq = &sc->submit_queues[0];
1514         cq = &sc->compl_queues[0];
1515
1516         pthread_mutex_lock(&sq->mtx);
1517
1518         sqhead = sq->head;
1519         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1520
1521         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1522                 cmd = &(sq->qbase)[sqhead];
1523                 compl.cdw0 = 0;
1524                 compl.status = 0;
1525
1526                 switch (cmd->opc) {
1527                 case NVME_OPC_DELETE_IO_SQ:
1528                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1529                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1530                         break;
1531                 case NVME_OPC_CREATE_IO_SQ:
1532                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1533                         nvme_opc_create_io_sq(sc, cmd, &compl);
1534                         break;
1535                 case NVME_OPC_DELETE_IO_CQ:
1536                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1537                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1538                         break;
1539                 case NVME_OPC_CREATE_IO_CQ:
1540                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1541                         nvme_opc_create_io_cq(sc, cmd, &compl);
1542                         break;
1543                 case NVME_OPC_GET_LOG_PAGE:
1544                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1545                         nvme_opc_get_log_page(sc, cmd, &compl);
1546                         break;
1547                 case NVME_OPC_IDENTIFY:
1548                         DPRINTF("%s command IDENTIFY", __func__);
1549                         nvme_opc_identify(sc, cmd, &compl);
1550                         break;
1551                 case NVME_OPC_ABORT:
1552                         DPRINTF("%s command ABORT", __func__);
1553                         nvme_opc_abort(sc, cmd, &compl);
1554                         break;
1555                 case NVME_OPC_SET_FEATURES:
1556                         DPRINTF("%s command SET_FEATURES", __func__);
1557                         nvme_opc_set_features(sc, cmd, &compl);
1558                         break;
1559                 case NVME_OPC_GET_FEATURES:
1560                         DPRINTF("%s command GET_FEATURES", __func__);
1561                         nvme_opc_get_features(sc, cmd, &compl);
1562                         break;
1563                 case NVME_OPC_FIRMWARE_ACTIVATE:
1564                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1565                         pci_nvme_status_tc(&compl.status,
1566                             NVME_SCT_COMMAND_SPECIFIC,
1567                             NVME_SC_INVALID_FIRMWARE_SLOT);
1568                         break;
1569                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1570                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1571                         nvme_opc_async_event_req(sc, cmd, &compl);
1572                         break;
1573                 case NVME_OPC_FORMAT_NVM:
1574                         DPRINTF("%s command FORMAT_NVM", __func__);
1575                         if ((sc->ctrldata.oacs &
1576                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1577                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                                     break;
1578                         }
1579                         compl.status = NVME_NO_STATUS;
1580                         nvme_opc_format_nvm(sc, cmd, &compl);
1581                         break;
1582                 default:
1583                         DPRINTF("0x%x command is not implemented",
1584                             cmd->opc);
1585                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1586                 }
1587                 sqhead = (sqhead + 1) % sq->size;
1588
1589                 if (NVME_COMPLETION_VALID(compl)) {
1590                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1591                             compl.cdw0,
1592                             cmd->cid,
1593                             0,          /* SQID */
1594                             compl.status);
1595                 }
1596         }
1597
1598         DPRINTF("setting sqhead %u", sqhead);
1599         sq->head = sqhead;
1600
1601         if (cq->head != cq->tail)
1602                 pci_generate_msix(sc->nsc_pi, 0);
1603
1604         pthread_mutex_unlock(&sq->mtx);
1605 }
1606
1607 /*
1608  * Update the Write and Read statistics reported in SMART data
1609  *
1610  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1611  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1612  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1613  */
1614 static void
1615 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1616     size_t bytes, uint16_t status)
1617 {
1618
1619         pthread_mutex_lock(&sc->mtx);
1620         switch (opc) {
1621         case NVME_OPC_WRITE:
1622                 sc->write_commands++;
1623                 if (status != NVME_SC_SUCCESS)
1624                         break;
1625                 sc->write_dunits_remainder += (bytes / 512);
1626                 while (sc->write_dunits_remainder >= 1000) {
1627                         sc->write_data_units++;
1628                         sc->write_dunits_remainder -= 1000;
1629                 }
1630                 break;
1631         case NVME_OPC_READ:
1632                 sc->read_commands++;
1633                 if (status != NVME_SC_SUCCESS)
1634                         break;
1635                 sc->read_dunits_remainder += (bytes / 512);
1636                 while (sc->read_dunits_remainder >= 1000) {
1637                         sc->read_data_units++;
1638                         sc->read_dunits_remainder -= 1000;
1639                 }
1640                 break;
1641         default:
1642                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1643                 break;
1644         }
1645         pthread_mutex_unlock(&sc->mtx);
1646 }
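/*
 * Worked example of the data unit accounting above (assumes the remainder
 * counters start at 999, per the round-up note): a single successful 4 KiB
 * write adds 4096 / 512 = 8 to write_dunits_remainder. Starting from 999
 * this reaches 1007, so write_data_units increments once and the remainder
 * drops to 7; i.e. even the first 512-byte block written is reported as one
 * full data unit.
 */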
1647
1648 /*
1649  * Check if the combination of Starting LBA (slba) and Number of Logical
1650  * Blocks (nlb) exceeds the range of the underlying storage.
1651  *
1652  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1653  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1654  * overflow.
1655  */
1656 static bool
1657 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1658     uint32_t nlb)
1659 {
1660         size_t  offset, bytes;
1661
1662         /* Overflow check of multiplying Starting LBA by the sector size */
1663         if (slba >> (64 - nvstore->sectsz_bits))
1664                 return (true);
1665
1666         offset = slba << nvstore->sectsz_bits;
1667         bytes = (uint64_t)nlb << nvstore->sectsz_bits;
1668
1669         /* Range check of the Starting LBA and Number of Logical Blocks */
1670         if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
1671                 return (true);
1672
1673         return (false);
1674 }
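/*
 * Illustrative sketch of the checks above: with 512-byte sectors
 * (sectsz_bits = 9), an SLBA of 1 << 55 or larger would wrap when shifted
 * left by 9 bits, so the first test (slba >> (64 - 9) != 0) rejects it
 * before the shift is ever performed. For an SLBA that passes, the
 * remaining capacity (nvstore->size - offset) is compared against the
 * request length to catch transfers that start inside the namespace but
 * run past its end.
 */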
1675
1676 static int
1677 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1678         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1679 {
1680         int iovidx;
1681
1682         if (req == NULL)
1683                 return (-1);
1684
1685         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1686                 return (-1);
1687         }
1688
1689         /* concatenate contig block-iovs to minimize number of iovs */
1690         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1691                 iovidx = req->io_req.br_iovcnt - 1;
1692
1693                 req->io_req.br_iov[iovidx].iov_base =
1694                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1695                                      req->prev_gpaddr, size);
1696
1697                 req->prev_size += size;
1698                 req->io_req.br_resid += size;
1699
1700                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1701         } else {
1702                 iovidx = req->io_req.br_iovcnt;
1703                 if (iovidx == 0) {
1704                         req->io_req.br_offset = lba;
1705                         req->io_req.br_resid = 0;
1706                         req->io_req.br_param = req;
1707                 }
1708
1709                 req->io_req.br_iov[iovidx].iov_base =
1710                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1711                                      gpaddr, size);
1712
1713                 req->io_req.br_iov[iovidx].iov_len = size;
1714
1715                 req->prev_gpaddr = gpaddr;
1716                 req->prev_size = size;
1717                 req->io_req.br_resid += size;
1718
1719                 req->io_req.br_iovcnt++;
1720         }
1721
1722         return (0);
1723 }
1724
1725 static void
1726 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1727         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1728         uint32_t cdw0, uint16_t status)
1729 {
1730         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1731
1732         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1733                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1734                  NVME_STATUS_GET_SC(status));
1735
1736         pci_nvme_cq_update(sc, cq,
1737             0,          /* CDW0 */
1738             cid,
1739             sqid,
1740             status);
1741
1742         if (cq->head != cq->tail) {
1743                 if (cq->intr_en & NVME_CQ_INTEN) {
1744                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1745                 } else {
1746                         DPRINTF("%s: CQ%u interrupt disabled",
1747                                                 __func__, sq->cqid);
1748                 }
1749         }
1750 }
1751
1752 static void
1753 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1754 {
1755         req->sc = NULL;
1756         req->nvme_sq = NULL;
1757         req->sqid = 0;
1758
1759         pthread_mutex_lock(&sc->mtx);
1760
1761         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1762         sc->pending_ios--;
1763
1764         /* with no I/O pending, set ready if enabled but RDY not yet set */
1765         if (sc->pending_ios == 0 &&
1766             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1767                 sc->regs.csts |= NVME_CSTS_RDY;
1768
1769         pthread_mutex_unlock(&sc->mtx);
1770
1771         sem_post(&sc->iosemlock);
1772 }
1773
1774 static struct pci_nvme_ioreq *
1775 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1776 {
1777         struct pci_nvme_ioreq *req = NULL;
1778
1779         sem_wait(&sc->iosemlock);
1780         pthread_mutex_lock(&sc->mtx);
1781
1782         req = STAILQ_FIRST(&sc->ioreqs_free);
1783         assert(req != NULL);
1784         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1785
1786         req->sc = sc;
1787
1788         sc->pending_ios++;
1789
1790         pthread_mutex_unlock(&sc->mtx);
1791
1792         req->io_req.br_iovcnt = 0;
1793         req->io_req.br_offset = 0;
1794         req->io_req.br_resid = 0;
1795         req->io_req.br_param = req;
1796         req->prev_gpaddr = 0;
1797         req->prev_size = 0;
1798
1799         return req;
1800 }
1801
1802 static void
1803 pci_nvme_io_done(struct blockif_req *br, int err)
1804 {
1805         struct pci_nvme_ioreq *req = br->br_param;
1806         struct nvme_submission_queue *sq = req->nvme_sq;
1807         uint16_t code, status;
1808
1809         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1810
1811         /* TODO return correct error */
1812         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1813         pci_nvme_status_genc(&status, code);
1814
1815         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1816         pci_nvme_stats_write_read_update(req->sc, req->opc,
1817             req->bytes, status);
1818         pci_nvme_release_ioreq(req->sc, req);
1819 }
1820
1821 /*
1822  * Implements the Flush command. In the description of the Volatile Write
1823  * Cache (VWC) field of the Identify Controller data, the specification states:
1824  *    If a volatile write cache is not present, Flush commands complete
1825  *    successfully and have no effect
1826  * Therefore, set status to Success if the command is not supported
1827  * (i.e. RAM backing, or when the blockif reports flush as unsupported).
1828  */
1829 static bool
1830 nvme_opc_flush(struct pci_nvme_softc *sc,
1831     struct nvme_command *cmd,
1832     struct pci_nvme_blockstore *nvstore,
1833     struct pci_nvme_ioreq *req,
1834     uint16_t *status)
1835 {
1836         bool pending = false;
1837
1838         if (nvstore->type == NVME_STOR_RAM) {
1839                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1840         } else {
1841                 int err;
1842
1843                 req->io_req.br_callback = pci_nvme_io_done;
1844
1845                 err = blockif_flush(nvstore->ctx, &req->io_req);
1846                 switch (err) {
1847                 case 0:
1848                         pending = true;
1849                         break;
1850                 case EOPNOTSUPP:
1851                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1852                         break;
1853                 default:
1854                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1855                 }
1856         }
1857
1858         return (pending);
1859 }
1860
1861 static uint16_t
1862 nvme_write_read_ram(struct pci_nvme_softc *sc,
1863     struct pci_nvme_blockstore *nvstore,
1864     uint64_t prp1, uint64_t prp2,
1865     size_t offset, uint64_t bytes,
1866     bool is_write)
1867 {
1868         uint8_t *buf = nvstore->ctx;
1869         enum nvme_copy_dir dir;
1870         uint16_t status;
1871
1872         if (is_write)
1873                 dir = NVME_COPY_TO_PRP;
1874         else
1875                 dir = NVME_COPY_FROM_PRP;
1876
1877         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1878             buf + offset, bytes, dir))
1879                 pci_nvme_status_genc(&status,
1880                     NVME_SC_DATA_TRANSFER_ERROR);
1881         else
1882                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1883
1884         return (status);
1885 }
1886
1887 static uint16_t
1888 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1889     struct pci_nvme_blockstore *nvstore,
1890     struct pci_nvme_ioreq *req,
1891     uint64_t prp1, uint64_t prp2,
1892     size_t offset, uint64_t bytes,
1893     bool is_write)
1894 {
1895         uint64_t size;
1896         int err;
1897         uint16_t status = NVME_NO_STATUS;
1898
1899         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1900         if (pci_nvme_append_iov_req(sc, req, prp1,
1901             size, is_write, offset)) {
1902                 pci_nvme_status_genc(&status,
1903                     NVME_SC_DATA_TRANSFER_ERROR);
1904                 goto out;
1905         }
1906
1907         offset += size;
1908         bytes  -= size;
1909
1910         if (bytes == 0) {
1911                 ;
1912         } else if (bytes <= PAGE_SIZE) {
1913                 size = bytes;
1914                 if (pci_nvme_append_iov_req(sc, req, prp2,
1915                     size, is_write, offset)) {
1916                         pci_nvme_status_genc(&status,
1917                             NVME_SC_DATA_TRANSFER_ERROR);
1918                         goto out;
1919                 }
1920         } else {
1921                 void *vmctx = sc->nsc_pi->pi_vmctx;
1922                 uint64_t *prp_list = &prp2;
1923                 uint64_t *last = prp_list;
1924
1925                 /* PRP2 is pointer to a physical region page list */
1926                 while (bytes) {
1927                         /* Last entry in list points to the next list */
1928                         if (prp_list == last) {
1929                                 uint64_t prp = *prp_list;
1930
1931                                 prp_list = paddr_guest2host(vmctx, prp,
1932                                     PAGE_SIZE - (prp % PAGE_SIZE));
1933                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
1934                         }
1935
1936                         size = MIN(bytes, PAGE_SIZE);
1937
1938                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
1939                             size, is_write, offset)) {
1940                                 pci_nvme_status_genc(&status,
1941                                     NVME_SC_DATA_TRANSFER_ERROR);
1942                                 goto out;
1943                         }
1944
1945                         offset += size;
1946                         bytes  -= size;
1947
1948                         prp_list++;
1949                 }
1950         }
1951         req->io_req.br_callback = pci_nvme_io_done;
1952         if (is_write)
1953                 err = blockif_write(nvstore->ctx, &req->io_req);
1954         else
1955                 err = blockif_read(nvstore->ctx, &req->io_req);
1956
1957         if (err)
1958                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
1959 out:
1960         return (status);
1961 }
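/*
 * Illustrative sketch of the PRP handling above (guest addresses are
 * hypothetical): for a page-aligned 12 KiB read with PRP1 = 0x10000, the
 * first iov covers bytes 0-4095 at PRP1. Since more than one page remains,
 * PRP2 = 0x20000 is treated as a PRP list, and its first two entries (say
 * 0x30000 and 0x40000) supply the iovs for bytes 4096-8191 and 8192-12287.
 * Had the transfer been only 8 KiB, PRP2 itself would have been used as the
 * data pointer for the second page.
 */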
1962
1963 static bool
1964 nvme_opc_write_read(struct pci_nvme_softc *sc,
1965     struct nvme_command *cmd,
1966     struct pci_nvme_blockstore *nvstore,
1967     struct pci_nvme_ioreq *req,
1968     uint16_t *status)
1969 {
1970         uint64_t lba, nblocks, bytes;
1971         size_t offset;
1972         bool is_write = cmd->opc == NVME_OPC_WRITE;
1973         bool pending = false;
1974
1975         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1976         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1977         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
1978                 WPRINTF("%s command would exceed LBA range", __func__);
1979                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
1980                 goto out;
1981         }
1982
1983         bytes  = nblocks << nvstore->sectsz_bits;
1984         if (bytes > NVME_MAX_DATA_SIZE) {
1985                 WPRINTF("%s command would exceed MDTS", __func__);
1986                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
1987                 goto out;
1988         }
1989
1990         offset = lba << nvstore->sectsz_bits;
1991
1992         req->bytes = bytes;
1993         req->io_req.br_offset = lba;
1994
1995         /* PRP bits 1:0 must be zero */
1996         cmd->prp1 &= ~0x3UL;
1997         cmd->prp2 &= ~0x3UL;
1998
1999         if (nvstore->type == NVME_STOR_RAM) {
2000                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2001                     cmd->prp2, offset, bytes, is_write);
2002         } else {
2003                 *status = nvme_write_read_blockif(sc, nvstore, req,
2004                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2005
2006                 if (*status == NVME_NO_STATUS)
2007                         pending = true;
2008         }
2009 out:
2010         if (!pending)
2011                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2012
2013         return (pending);
2014 }
2015
2016 static void
2017 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2018 {
2019         struct pci_nvme_ioreq *req = br->br_param;
2020         struct pci_nvme_softc *sc = req->sc;
2021         bool done = true;
2022         uint16_t status;
2023
2024         if (err) {
2025                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2026         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2027                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2028         } else {
2029                 struct iovec *iov = req->io_req.br_iov;
2030
2031                 req->prev_gpaddr++;
2032                 iov += req->prev_gpaddr;
2033
2034                 /* The iov_* values already include the sector size */
2035                 req->io_req.br_offset = (off_t)iov->iov_base;
2036                 req->io_req.br_resid = iov->iov_len;
2037                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2038                         pci_nvme_status_genc(&status,
2039                             NVME_SC_INTERNAL_DEVICE_ERROR);
2040                 } else
2041                         done = false;
2042         }
2043
2044         if (done) {
2045                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2046                     req->cid, 0, status);
2047                 pci_nvme_release_ioreq(sc, req);
2048         }
2049 }
2050
2051 static bool
2052 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2053     struct nvme_command *cmd,
2054     struct pci_nvme_blockstore *nvstore,
2055     struct pci_nvme_ioreq *req,
2056     uint16_t *status)
2057 {
2058         int err;
2059         bool pending = false;
2060
2061         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2062                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2063                 goto out;
2064         }
2065
2066         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2067                 struct nvme_dsm_range *range;
2068                 size_t offset, bytes;
2069                 uint32_t nr, r;
2070                 int sectsz_bits = sc->nvstore.sectsz_bits;
2071
2072                 /*
2073                  * DSM calls are advisory only, and compliant controllers
2074                  * may choose to take no actions (i.e. return Success).
2075                  */
2076                 if (!nvstore->deallocate) {
2077                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2078                         goto out;
2079                 }
2080
2081                 if (req == NULL) {
2082                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2083                         goto out;
2084                 }
2085
2086                 /* copy locally because a range entry could straddle PRPs */
2087                 range = calloc(1, NVME_MAX_DSM_TRIM);
2088                 if (range == NULL) {
2089                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2090                         goto out;
2091                 }
2092                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2093                     (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2094
2095                 if (pci_nvme_out_of_range(nvstore, range[0].starting_lba,
2096                     range[0].length)) {
2097                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2098                         goto out;
2099                 }
2100                 offset = range[0].starting_lba << sectsz_bits;
2101                 bytes = range[0].length << sectsz_bits;
2102
2103                 /*
2104                  * If the request is for more than a single range, store
2105                  * the ranges in the br_iov. Optimize for the common case
2106                  * of a single range.
2107                  *
2108                  * Note that NVMe Number of Ranges is a zero based value
2109                  */
2110                 nr = cmd->cdw10 & 0xff;
2111
2112                 req->io_req.br_iovcnt = 0;
2113                 req->io_req.br_offset = offset;
2114                 req->io_req.br_resid = bytes;
2115
2116                 if (nr == 0) {
2117                         req->io_req.br_callback = pci_nvme_io_done;
2118                 } else {
2119                         struct iovec *iov = req->io_req.br_iov;
2120
2121                         for (r = 0; r <= nr; r++) {
2122                                 if (pci_nvme_out_of_range(nvstore, range[r].starting_lba,
2123                                     range[r].length)) {
2124                                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2125                                         goto out;
2126                                 }
2127                                 offset = range[r].starting_lba << sectsz_bits;
2128                                 bytes = range[r].length << sectsz_bits;
2129                                 if ((nvstore->size - offset) < bytes) {
2130                                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2131                                         goto out;
2132                                 }
2133                                 iov[r].iov_base = (void *)offset;
2134                                 iov[r].iov_len = bytes;
2135                         }
2136                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2137
2138                         /*
2139                          * Use prev_gpaddr to track the current entry and
2140                          * prev_size to track the number of entries
2141                          */
2142                         req->prev_gpaddr = 0;
2143                         req->prev_size = r;
2144                 }
2145
2146                 err = blockif_delete(nvstore->ctx, &req->io_req);
2147                 if (err)
2148                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2149                 else
2150                         pending = true;
2151
2152                 free(range);
2153         }
2154 out:
2155         return (pending);
2156 }
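/*
 * Illustrative sketch of the multi-range Deallocate path above: for a
 * command with Number of Ranges = 2 (i.e. three ranges), the byte offset
 * and length of each range are stored in br_iov[0..2], prev_size is set to
 * 3 and prev_gpaddr to 0. blockif_delete() is issued for range 0, and
 * pci_nvme_dealloc_sm() then advances prev_gpaddr and re-issues
 * blockif_delete() for ranges 1 and 2 before posting the completion.
 */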
2157
2158 static void
2159 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2160 {
2161         struct nvme_submission_queue *sq;
2162         uint16_t status;
2163         uint16_t sqhead;
2164
2165         /* handle all submissions up to sq->tail index */
2166         sq = &sc->submit_queues[idx];
2167
2168         pthread_mutex_lock(&sq->mtx);
2169
2170         sqhead = sq->head;
2171         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2172                  idx, sqhead, sq->tail, sq->qbase);
2173
2174         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2175                 struct nvme_command *cmd;
2176                 struct pci_nvme_ioreq *req;
2177                 uint32_t nsid;
2178                 bool pending;
2179
2180                 pending = false;
2181                 req = NULL;
2182                 status = 0;
2183
2184                 cmd = &sq->qbase[sqhead];
2185                 sqhead = (sqhead + 1) % sq->size;
2186
2187                 nsid = le32toh(cmd->nsid);
2188                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2189                         pci_nvme_status_genc(&status,
2190                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2191                         status |=
2192                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2193                         goto complete;
2194                 }
2195
2196                 req = pci_nvme_get_ioreq(sc);
2197                 if (req == NULL) {
2198                         pci_nvme_status_genc(&status,
2199                             NVME_SC_INTERNAL_DEVICE_ERROR);
2200                         WPRINTF("%s: unable to allocate IO req", __func__);
2201                         goto complete;
2202                 }
2203                 req->nvme_sq = sq;
2204                 req->sqid = idx;
2205                 req->opc = cmd->opc;
2206                 req->cid = cmd->cid;
2207                 req->nsid = cmd->nsid;
2208
2209                 switch (cmd->opc) {
2210                 case NVME_OPC_FLUSH:
2211                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2212                             req, &status);
2213                         break;
2214                 case NVME_OPC_WRITE:
2215                 case NVME_OPC_READ:
2216                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2217                             req, &status);
2218                         break;
2219                 case NVME_OPC_WRITE_ZEROES:
2220                         /* TODO: write zeroes
2221                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2222                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2223                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2224                         break;
2225                 case NVME_OPC_DATASET_MANAGEMENT:
2226                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2227                             req, &status);
2228                         break;
2229                 default:
2230                         WPRINTF("%s unhandled io command 0x%x",
2231                             __func__, cmd->opc);
2232                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2233                 }
2234 complete:
2235                 if (!pending) {
2236                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2237                             status);
2238                         if (req != NULL)
2239                                 pci_nvme_release_ioreq(sc, req);
2240                 }
2241         }
2242
2243         sq->head = sqhead;
2244
2245         pthread_mutex_unlock(&sq->mtx);
2246 }
2247
2248 static void
2249 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2250         uint64_t idx, int is_sq, uint64_t value)
2251 {
2252         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2253                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2254
2255         if (is_sq) {
2256                 if (idx > sc->num_squeues) {
2257                         WPRINTF("%s queue index %lu overflow from "
2258                                  "guest (max %u)",
2259                                  __func__, idx, sc->num_squeues);
2260                         return;
2261                 }
2262
2263                 atomic_store_short(&sc->submit_queues[idx].tail,
2264                                    (uint16_t)value);
2265
2266                 if (idx == 0) {
2267                         pci_nvme_handle_admin_cmd(sc, value);
2268                 } else {
2269                         /* submission queue; handle new entries in SQ */
2270                         if (idx > sc->num_squeues) {
2271                                 WPRINTF("%s SQ index %lu overflow from "
2272                                          "guest (max %u)",
2273                                          __func__, idx, sc->num_squeues);
2274                                 return;
2275                         }
2276                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2277                 }
2278         } else {
2279                 if (idx > sc->num_cqueues) {
2280                         WPRINTF("%s queue index %lu overflow from "
2281                                  "guest (max %u)",
2282                                  __func__, idx, sc->num_cqueues);
2283                         return;
2284                 }
2285
2286                 atomic_store_short(&sc->compl_queues[idx].head,
2287                                 (uint16_t)value);
2288         }
2289 }
2290
2291 static void
2292 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2293 {
2294         const char *s = iswrite ? "WRITE" : "READ";
2295
2296         switch (offset) {
2297         case NVME_CR_CAP_LOW:
2298                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2299                 break;
2300         case NVME_CR_CAP_HI:
2301                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2302                 break;
2303         case NVME_CR_VS:
2304                 DPRINTF("%s %s NVME_CR_VS", func, s);
2305                 break;
2306         case NVME_CR_INTMS:
2307                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2308                 break;
2309         case NVME_CR_INTMC:
2310                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2311                 break;
2312         case NVME_CR_CC:
2313                 DPRINTF("%s %s NVME_CR_CC", func, s);
2314                 break;
2315         case NVME_CR_CSTS:
2316                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2317                 break;
2318         case NVME_CR_NSSR:
2319                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2320                 break;
2321         case NVME_CR_AQA:
2322                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2323                 break;
2324         case NVME_CR_ASQ_LOW:
2325                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2326                 break;
2327         case NVME_CR_ASQ_HI:
2328                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2329                 break;
2330         case NVME_CR_ACQ_LOW:
2331                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2332                 break;
2333         case NVME_CR_ACQ_HI:
2334                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2335                 break;
2336         default:
2337                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2338         }
2339
2340 }
2341
2342 static void
2343 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2344         uint64_t offset, int size, uint64_t value)
2345 {
2346         uint32_t ccreg;
2347
2348         if (offset >= NVME_DOORBELL_OFFSET) {
2349                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2350                 uint64_t idx = belloffset / 8; /* 8 doorbell bytes per queue pair */
2351                 int is_sq = (belloffset % 8) < 4;
2352
2353                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2354                         WPRINTF("guest attempted an overflow write offset "
2355                                  "0x%lx, val 0x%lx in %s",
2356                                  offset, value, __func__);
2357                         return;
2358                 }
2359
2360                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2361                 return;
2362         }
2363
2364         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2365                 offset, size, value);
2366
2367         if (size != 4) {
2368                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2369                          "val 0x%lx) to bar0 in %s",
2370                          size, offset, value, __func__);
2371                 /* TODO: shutdown device */
2372                 return;
2373         }
2374
2375         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2376
2377         pthread_mutex_lock(&sc->mtx);
2378
2379         switch (offset) {
2380         case NVME_CR_CAP_LOW:
2381         case NVME_CR_CAP_HI:
2382                 /* readonly */
2383                 break;
2384         case NVME_CR_VS:
2385                 /* readonly */
2386                 break;
2387         case NVME_CR_INTMS:
2388                 /* MSI-X, so ignore */
2389                 break;
2390         case NVME_CR_INTMC:
2391                 /* MSI-X, so ignore */
2392                 break;
2393         case NVME_CR_CC:
2394                 ccreg = (uint32_t)value;
2395
2396                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2397                          "iocqes %u",
2398                         __func__,
2399                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2400                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2401                          NVME_CC_GET_IOCQES(ccreg));
2402
2403                 if (NVME_CC_GET_SHN(ccreg)) {
2404                         /* perform shutdown - flush out data to backend */
2405                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2406                             NVME_CSTS_REG_SHST_SHIFT);
2407                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2408                             NVME_CSTS_REG_SHST_SHIFT;
2409                 }
2410                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2411                         if (NVME_CC_GET_EN(ccreg) == 0)
2412                                 /* transition 1->0 causes controller reset */
2413                                 pci_nvme_reset_locked(sc);
2414                         else
2415                                 pci_nvme_init_controller(ctx, sc);
2416                 }
2417
2418                 /* Insert the iocqes, iosqes and en bits from the write */
2419                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2420                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2421                 if (NVME_CC_GET_EN(ccreg) == 0) {
2422                         /* Insert the ams, mps and css bit fields */
2423                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2424                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2425                         sc->regs.csts &= ~NVME_CSTS_RDY;
2426                 } else if (sc->pending_ios == 0) {
2427                         sc->regs.csts |= NVME_CSTS_RDY;
2428                 }
2429                 break;
2430         case NVME_CR_CSTS:
2431                 break;
2432         case NVME_CR_NSSR:
2433                 /* ignore writes; don't support subsystem reset */
2434                 break;
2435         case NVME_CR_AQA:
2436                 sc->regs.aqa = (uint32_t)value;
2437                 break;
2438         case NVME_CR_ASQ_LOW:
2439                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2440                                (0xFFFFF000 & value);
2441                 break;
2442         case NVME_CR_ASQ_HI:
2443                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2444                                (value << 32);
2445                 break;
2446         case NVME_CR_ACQ_LOW:
2447                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2448                                (0xFFFFF000 & value);
2449                 break;
2450         case NVME_CR_ACQ_HI:
2451                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2452                                (value << 32);
2453                 break;
2454         default:
2455                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2456                          __func__, offset, value, size);
2457         }
2458         pthread_mutex_unlock(&sc->mtx);
2459 }
2460
2461 static void
2462 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2463                 int baridx, uint64_t offset, int size, uint64_t value)
2464 {
2465         struct pci_nvme_softc* sc = pi->pi_arg;
2466
2467         if (baridx == pci_msix_table_bar(pi) ||
2468             baridx == pci_msix_pba_bar(pi)) {
2469                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2470                          " value 0x%lx", baridx, offset, size, value);
2471
2472                 pci_emul_msix_twrite(pi, offset, size, value);
2473                 return;
2474         }
2475
2476         switch (baridx) {
2477         case 0:
2478                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2479                 break;
2480
2481         default:
2482                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2483                          __func__, baridx, value);
2484         }
2485 }
2486
2487 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2488         uint64_t offset, int size)
2489 {
2490         uint64_t value;
2491
2492         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2493
2494         if (offset < NVME_DOORBELL_OFFSET) {
2495                 void *p = &(sc->regs);
2496                 pthread_mutex_lock(&sc->mtx);
2497                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2498                 pthread_mutex_unlock(&sc->mtx);
2499         } else {
2500                 value = 0;
2501                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2502         }
2503
2504         switch (size) {
2505         case 1:
2506                 value &= 0xFF;
2507                 break;
2508         case 2:
2509                 value &= 0xFFFF;
2510                 break;
2511         case 4:
2512                 value &= 0xFFFFFFFF;
2513                 break;
2514         }
2515
2516         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2517                  offset, size, (uint32_t)value);
2518
2519         return (value);
2520 }
2521
2522
2523
2524 static uint64_t
2525 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2526     uint64_t offset, int size)
2527 {
2528         struct pci_nvme_softc* sc = pi->pi_arg;
2529
2530         if (baridx == pci_msix_table_bar(pi) ||
2531             baridx == pci_msix_pba_bar(pi)) {
2532                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2533                         baridx, offset, size);
2534
2535                 return pci_emul_msix_tread(pi, offset, size);
2536         }
2537
2538         switch (baridx) {
2539         case 0:
2540                 return pci_nvme_read_bar_0(sc, offset, size);
2541
2542         default:
2543                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2544         }
2545
2546         return (0);
2547 }
2548
2549
2550 static int
2551 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2552 {
2553         char bident[sizeof("XX:X:X")];
2554         char    *uopt, *xopts, *config;
2555         uint32_t sectsz;
2556         int optidx;
2557
2558         sc->max_queues = NVME_QUEUES;
2559         sc->max_qentries = NVME_MAX_QENTRIES;
2560         sc->ioslots = NVME_IOSLOTS;
2561         sc->num_squeues = sc->max_queues;
2562         sc->num_cqueues = sc->max_queues;
2563         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2564         sectsz = 0;
2565
2566         uopt = strdup(opts);
2567         optidx = 0;
2568         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2569                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2570         for (xopts = strtok(uopt, ",");
2571              xopts != NULL;
2572              xopts = strtok(NULL, ",")) {
2573
2574                 if ((config = strchr(xopts, '=')) != NULL)
2575                         *config++ = '\0';
2576
2577                 if (!strcmp("maxq", xopts)) {
2578                         sc->max_queues = atoi(config);
2579                 } else if (!strcmp("qsz", xopts)) {
2580                         sc->max_qentries = atoi(config);
2581                 } else if (!strcmp("ioslots", xopts)) {
2582                         sc->ioslots = atoi(config);
2583                 } else if (!strcmp("sectsz", xopts)) {
2584                         sectsz = atoi(config);
2585                 } else if (!strcmp("ser", xopts)) {
2586                         /*
2587                          * This field indicates the Product Serial Number in
2588                          * 7-bit ASCII; unused bytes should be space characters.
2589                          * Ref: NVMe v1.3c.
2590                          */
2591                         cpywithpad((char *)sc->ctrldata.sn,
2592                                    sizeof(sc->ctrldata.sn), config, ' ');
2593                 } else if (!strcmp("ram", xopts)) {
2594                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
2595
2596                         sc->nvstore.type = NVME_STOR_RAM;
2597                         sc->nvstore.size = sz * 1024 * 1024;
2598                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2599                         sc->nvstore.sectsz = 4096;
2600                         sc->nvstore.sectsz_bits = 12;
2601                         if (sc->nvstore.ctx == NULL) {
2602                                 perror("Unable to allocate RAM");
2603                                 free(uopt);
2604                                 return (-1);
2605                         }
2606                 } else if (!strcmp("eui64", xopts)) {
2607                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2608                 } else if (!strcmp("dsm", xopts)) {
2609                         if (!strcmp("auto", config))
2610                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2611                         else if (!strcmp("enable", config))
2612                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2613                         else if (!strcmp("disable", config))
2614                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2615                 } else if (optidx == 0) {
2616                         snprintf(bident, sizeof(bident), "%d:%d",
2617                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2618                         sc->nvstore.ctx = blockif_open(xopts, bident);
2619                         if (sc->nvstore.ctx == NULL) {
2620                                 perror("Could not open backing file");
2621                                 free(uopt);
2622                                 return (-1);
2623                         }
2624                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2625                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2626                 } else {
2627                         EPRINTLN("Invalid option %s", xopts);
2628                         free(uopt);
2629                         return (-1);
2630                 }
2631
2632                 optidx++;
2633         }
2634         free(uopt);
2635
2636         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2637                 EPRINTLN("backing store not specified");
2638                 return (-1);
2639         }
2640         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2641                 sc->nvstore.sectsz = sectsz;
2642         else if (sc->nvstore.type != NVME_STOR_RAM)
2643                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2644         for (sc->nvstore.sectsz_bits = 9;
2645              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2646              sc->nvstore.sectsz_bits++);
2647
2648         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2649                 sc->max_queues = NVME_QUEUES;
2650
2651         if (sc->max_qentries <= 0) {
2652                 EPRINTLN("Invalid qsz option");
2653                 return (-1);
2654         }
2655         if (sc->ioslots <= 0) {
2656                 EPRINTLN("Invalid ioslots option");
2657                 return (-1);
2658         }
2659
2660         return (0);
2661 }
2662
2663 static int
2664 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2665 {
2666         struct pci_nvme_softc *sc;
2667         uint32_t pci_membar_sz;
2668         int     error;
2669
2670         error = 0;
2671
2672         sc = calloc(1, sizeof(struct pci_nvme_softc));
2673         pi->pi_arg = sc;
2674         sc->nsc_pi = pi;
2675
2676         error = pci_nvme_parse_opts(sc, opts);
2677         if (error < 0)
2678                 goto done;
2679         else
2680                 error = 0;
2681
2682         STAILQ_INIT(&sc->ioreqs_free);
2683         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2684         for (int i = 0; i < sc->ioslots; i++) {
2685                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2686         }
2687
2688         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2689         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2690         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2691         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2692         pci_set_cfgdata8(pi, PCIR_PROGIF,
2693                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2694
2695         /*
2696          * Allocate size of NVMe registers + doorbell space for all queues.
2697          *
2698          * The specification requires a minimum memory I/O window size of 16K.
2699          * The Windows driver will refuse to start a device with a smaller
2700          * window.
2701          */
2702         pci_membar_sz = sizeof(struct nvme_registers) +
2703             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2704         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
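        /*
         * Illustrative sizing (the queue count is an example value): with
         * max_queues = 16, the doorbell space is 2 * 4 * 17 = 136 bytes on
         * top of the fixed register file, well under the 16 KiB minimum, so
         * the MAX() above sizes the BAR to NVME_MMIO_SPACE_MIN.
         */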
2705
2706         DPRINTF("nvme membar size: %u", pci_membar_sz);
2707
2708         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2709         if (error) {
2710                 WPRINTF("%s pci alloc mem bar failed", __func__);
2711                 goto done;
2712         }
2713
2714         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2715         if (error) {
2716                 WPRINTF("%s pci add msixcap failed", __func__);
2717                 goto done;
2718         }
2719
2720         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2721         if (error) {
2722                 WPRINTF("%s pci add Express capability failed", __func__);
2723                 goto done;
2724         }
2725
2726         pthread_mutex_init(&sc->mtx, NULL);
2727         sem_init(&sc->iosemlock, 0, sc->ioslots);
2728
2729         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2730         /*
2731          * Controller data depends on Namespace data so initialize Namespace
2732          * data first.
2733          */
2734         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2735         pci_nvme_init_ctrldata(sc);
2736         pci_nvme_init_logpages(sc);
2737         pci_nvme_init_features(sc);
2738
2739         pci_nvme_aer_init(sc);
2740
2741         pci_nvme_reset(sc);
2742
2743         pci_lintr_request(pi);
2744
2745 done:
2746         return (error);
2747 }
2748
2749
2750 struct pci_devemu pci_de_nvme = {
2751         .pe_emu =       "nvme",
2752         .pe_init =      pci_nvme_init,
2753         .pe_barwrite =  pci_nvme_write,
2754         .pe_barread =   pci_nvme_read
2755 };
2756 PCI_EMUL_SET(pci_de_nvme);