1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, or disable
51  *
52  */
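/*
 * Example invocation (illustrative only; the slot number, image path, and
 * sizes below are placeholders):
 *   -s 4,nvme,/path/to/image,maxq=4,qsz=512,ioslots=16,ser=BHYVENVME
 */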
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
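/* i.e., the smallest BAR0 region size is 1 << 14 = 16 KiB */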
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN             0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
106
107 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
108 #define NVME_MDTS               9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
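/*
 * Worked example with the defaults above: MPSMIN of 0 gives 4 KiB pages and
 * MDTS of 9 gives (1 << 9) = 512 pages, i.e. a 2 MiB maximum transfer.
 */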
112
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS          0xffff
115 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
116
117 /* helpers */
118
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)         ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)         ((one)  - 1)
123
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
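/*
 * Illustrative encoding: with 4 I/O SQs and 4 I/O CQs, NSQA occupies bits
 * 15:0 and NCQA bits 31:16 (both zero-based), yielding 0x00030003.
 */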
128
129 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
130
131 enum nvme_controller_register_offsets {
132         NVME_CR_CAP_LOW = 0x00,
133         NVME_CR_CAP_HI  = 0x04,
134         NVME_CR_VS      = 0x08,
135         NVME_CR_INTMS   = 0x0c,
136         NVME_CR_INTMC   = 0x10,
137         NVME_CR_CC      = 0x14,
138         NVME_CR_CSTS    = 0x1c,
139         NVME_CR_NSSR    = 0x20,
140         NVME_CR_AQA     = 0x24,
141         NVME_CR_ASQ_LOW = 0x28,
142         NVME_CR_ASQ_HI  = 0x2c,
143         NVME_CR_ACQ_LOW = 0x30,
144         NVME_CR_ACQ_HI  = 0x34,
145 };
146
147 enum nvme_cmd_cdw11 {
148         NVME_CMD_CDW11_PC  = 0x0001,
149         NVME_CMD_CDW11_IEN = 0x0002,
150         NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152
153 enum nvme_copy_dir {
154         NVME_COPY_TO_PRP,
155         NVME_COPY_FROM_PRP,
156 };
157
158 #define NVME_CQ_INTEN   0x01
159 #define NVME_CQ_INTCOAL 0x02
160
161 struct nvme_completion_queue {
162         struct nvme_completion *qbase;
163         pthread_mutex_t mtx;
164         uint32_t        size;
165         uint16_t        tail; /* nvme progress */
166         uint16_t        head; /* guest progress */
167         uint16_t        intr_vec;
168         uint32_t        intr_en;
169 };
170
171 struct nvme_submission_queue {
172         struct nvme_command *qbase;
173         pthread_mutex_t mtx;
174         uint32_t        size;
175         uint16_t        head; /* nvme progress */
176         uint16_t        tail; /* guest progress */
177         uint16_t        cqid; /* completion queue id */
178         int             qpriority;
179 };
180
181 enum nvme_storage_type {
182         NVME_STOR_BLOCKIF = 0,
183         NVME_STOR_RAM = 1,
184 };
185
186 struct pci_nvme_blockstore {
187         enum nvme_storage_type type;
188         void            *ctx;
189         uint64_t        size;
190         uint32_t        sectsz;
191         uint32_t        sectsz_bits;
192         uint64_t        eui64;
193         uint32_t        deallocate:1;
194 };
195
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
203 #define MDTS_PAD_SIZE \
204         (NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205          NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206          0)
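/*
 * Example (hypothetical BLOCKIF_IOV_MAX of 128): NVME_MAX_IOVEC is 513, so
 * MDTS_PAD_SIZE would reserve 385 additional iovec entries.
 */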
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 struct pci_nvme_aer {
255         STAILQ_ENTRY(pci_nvme_aer) link;
256         uint16_t        cid;    /* Command ID of the submitted AER */
257 };
258
259 struct pci_nvme_softc {
260         struct pci_devinst *nsc_pi;
261
262         pthread_mutex_t mtx;
263
264         struct nvme_registers regs;
265
266         struct nvme_namespace_data  nsdata;
267         struct nvme_controller_data ctrldata;
268         struct nvme_error_information_entry err_log;
269         struct nvme_health_information_page health_log;
270         struct nvme_firmware_page fw_log;
271
272         struct pci_nvme_blockstore nvstore;
273
274         uint16_t        max_qentries;   /* max entries per queue */
275         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
276         uint32_t        num_cqueues;
277         uint32_t        num_squeues;
278         bool            num_q_is_set; /* Has host set Number of Queues */
279
280         struct pci_nvme_ioreq *ioreqs;
281         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
282         uint32_t        pending_ios;
283         uint32_t        ioslots;
284         sem_t           iosemlock;
285
286         /*
287          * Memory mapped Submission and Completion queues
288          * Each array includes both Admin and IO queues
289          */
290         struct nvme_completion_queue *compl_queues;
291         struct nvme_submission_queue *submit_queues;
292
293         struct nvme_feature_obj feat[NVME_FID_MAX];
294
295         enum nvme_dsm_type dataset_management;
296
297         /* Accounting for SMART data */
298         __uint128_t     read_data_units;
299         __uint128_t     write_data_units;
300         __uint128_t     read_commands;
301         __uint128_t     write_commands;
302         uint32_t        read_dunits_remainder;
303         uint32_t        write_dunits_remainder;
304
305         STAILQ_HEAD(, pci_nvme_aer) aer_list;
306         uint32_t        aer_count;
307 };
308
309
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
313
314 /* Controller Configuration utils */
315 #define NVME_CC_GET_EN(cc) \
316         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
317 #define NVME_CC_GET_CSS(cc) \
318         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
319 #define NVME_CC_GET_SHN(cc) \
320         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
321 #define NVME_CC_GET_IOSQES(cc) \
322         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
323 #define NVME_CC_GET_IOCQES(cc) \
324         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
325
326 #define NVME_CC_WRITE_MASK \
327         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
328          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
329          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
330
331 #define NVME_CC_NEN_WRITE_MASK \
332         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
333          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
334          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
335
336 /* Controller Status utils */
337 #define NVME_CSTS_GET_RDY(sts) \
338         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
339
340 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
341
342 /* Completion Queue status word utils */
343 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
344 #define NVME_STATUS_MASK \
345         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
346          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
347
348 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
349         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
350
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352     struct nvme_feature_obj *,
353     struct nvme_command *,
354     struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359 static void nvme_feature_iv_config(struct pci_nvme_softc *,
360     struct nvme_feature_obj *,
361     struct nvme_command *,
362     struct nvme_completion *);
363
364 static __inline void
365 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
366 {
367         size_t len;
368
369         len = strnlen(src, dst_size);
370         memset(dst, pad, dst_size);
371         memcpy(dst, src, len);
372 }
373
374 static __inline void
375 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
376 {
377
378         *status &= ~NVME_STATUS_MASK;
379         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
380                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
381 }
382
383 static __inline void
384 pci_nvme_status_genc(uint16_t *status, uint16_t code)
385 {
386
387         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
388 }
389
390 /*
391  * Initialize the requested number of IO Submission and Completion Queues.
392  * Admin queues are allocated implicitly.
393  */
394 static void
395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
396 {
397         uint32_t i;
398
399         /*
400          * Allocate and initialize the Submission Queues
401          */
402         if (nsq > NVME_QUEUES) {
403                 WPRINTF("%s: clamping number of SQ from %u to %u",
404                                         __func__, nsq, NVME_QUEUES);
405                 nsq = NVME_QUEUES;
406         }
407
408         sc->num_squeues = nsq;
409
410         sc->submit_queues = calloc(sc->num_squeues + 1,
411                                 sizeof(struct nvme_submission_queue));
412         if (sc->submit_queues == NULL) {
413                 WPRINTF("%s: SQ allocation failed", __func__);
414                 sc->num_squeues = 0;
415         } else {
416                 struct nvme_submission_queue *sq = sc->submit_queues;
417
418                 for (i = 0; i < sc->num_squeues; i++)
419                         pthread_mutex_init(&sq[i].mtx, NULL);
420         }
421
422         /*
423          * Allocate and initialize the Completion Queues
424          */
425         if (ncq > NVME_QUEUES) {
426                 WPRINTF("%s: clamping number of CQ from %u to %u",
427                                         __func__, ncq, NVME_QUEUES);
428                 ncq = NVME_QUEUES;
429         }
430
431         sc->num_cqueues = ncq;
432
433         sc->compl_queues = calloc(sc->num_cqueues + 1,
434                                 sizeof(struct nvme_completion_queue));
435         if (sc->compl_queues == NULL) {
436                 WPRINTF("%s: CQ allocation failed", __func__);
437                 sc->num_cqueues = 0;
438         } else {
439                 struct nvme_completion_queue *cq = sc->compl_queues;
440
441                 for (i = 0; i < sc->num_cqueues; i++)
442                         pthread_mutex_init(&cq[i].mtx, NULL);
443         }
444 }
445
446 static void
447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
448 {
449         struct nvme_controller_data *cd = &sc->ctrldata;
450
451         cd->vid = 0xFB5D;
452         cd->ssvid = 0x0000;
453
454         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
455         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
456
457         /* Num of submission commands that we can handle at a time (2^rab) */
458         cd->rab   = 4;
459
460         /* FreeBSD OUI */
461         cd->ieee[0] = 0x58;
462         cd->ieee[1] = 0x9c;
463         cd->ieee[2] = 0xfc;
464
465         cd->mic = 0;
466
467         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
468
469         cd->ver = 0x00010300;
470
471         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
472         cd->acl = 2;
473         cd->aerl = 4;
474
475         /* Advertise 1, Read-only firmware slot */
476         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
477             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
478         cd->lpa = 0;    /* TODO: support some simple things like SMART */
479         cd->elpe = 0;   /* max error log page entries */
480         cd->npss = 1;   /* number of power states support */
481
482         /* Warning Composite Temperature Threshold */
483         cd->wctemp = 0x0157;
484
485         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
486             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
487         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
488             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
489         cd->nn = 1;     /* number of namespaces */
490
491         cd->oncs = 0;
492         switch (sc->dataset_management) {
493         case NVME_DATASET_MANAGEMENT_AUTO:
494                 if (sc->nvstore.deallocate)
495                         cd->oncs |= NVME_ONCS_DSM;
496                 break;
497         case NVME_DATASET_MANAGEMENT_ENABLE:
498                 cd->oncs |= NVME_ONCS_DSM;
499                 break;
500         default:
501                 break;
502         }
503
504         cd->fna = 0x03;
505
506         cd->power_state[0].mp = 10;
507 }
508
509 /*
510  * Calculate the CRC-16 of the given buffer
511  * See copyright attribution at top of file
512  */
513 static uint16_t
514 crc16(uint16_t crc, const void *buffer, unsigned int len)
515 {
516         const unsigned char *cp = buffer;
517         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
518         static uint16_t const crc16_table[256] = {
519                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
520                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
521                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
522                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
523                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
524                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
525                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
526                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
527                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
528                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
529                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
530                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
531                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
532                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
533                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
534                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
535                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
536                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
537                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
538                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
539                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
540                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
541                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
542                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
543                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
544                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
545                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
546                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
547                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
548                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
549                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
550                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
551         };
552
553         while (len--)
554                 crc = (((crc >> 8) & 0xffU) ^
555                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
556         return crc;
557 }
558
559 static void
560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
561     struct nvme_namespace_data *nd, uint32_t nsid,
562     struct pci_nvme_blockstore *nvstore)
563 {
564
565         /* Get capacity and block size information from backing store */
566         nd->nsze = nvstore->size / nvstore->sectsz;
567         nd->ncap = nd->nsze;
568         nd->nuse = nd->nsze;
569
570         if (nvstore->type == NVME_STOR_BLOCKIF)
571                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
572
573         nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
574         nd->flbas = 0;
575
576         /* Create an EUI-64 if user did not provide one */
577         if (nvstore->eui64 == 0) {
578                 char *data = NULL;
579                 uint64_t eui64 = nvstore->eui64;
580
581                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
582                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
583
584                 if (data != NULL) {
585                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
586                         free(data);
587                 }
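                /* Fold the namespace ID into the low 16 bits of the EUI-64 */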
588                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
589         }
590         be64enc(nd->eui64, nvstore->eui64);
591
592         /* LBA data-sz = 2^lbads */
593         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
594 }
595
596 static void
597 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
598 {
599
600         memset(&sc->err_log, 0, sizeof(sc->err_log));
601         memset(&sc->health_log, 0, sizeof(sc->health_log));
602         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
603
604         /* Set read/write remainder to round up according to spec */
605         sc->read_dunits_remainder = 999;
606         sc->write_dunits_remainder = 999;
607
608         /* Set nominal Health values checked by implementations */
609         sc->health_log.temperature = 310;
610         sc->health_log.available_spare = 100;
611         sc->health_log.available_spare_threshold = 10;
612 }
613
614 static void
615 pci_nvme_init_features(struct pci_nvme_softc *sc)
616 {
617
618         sc->feat[0].set = nvme_feature_invalid_cb;
619         sc->feat[0].get = nvme_feature_invalid_cb;
620
621         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
622         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
623         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
624         sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
625             nvme_feature_iv_config;
626         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
627             nvme_feature_invalid_cb;
628         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
629             nvme_feature_invalid_cb;
630 }
631
632 static void
633 pci_nvme_aer_init(struct pci_nvme_softc *sc)
634 {
635
636         STAILQ_INIT(&sc->aer_list);
637         sc->aer_count = 0;
638 }
639
640 static void
641 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
642 {
643         struct pci_nvme_aer *aer = NULL;
644
645         while (!STAILQ_EMPTY(&sc->aer_list)) {
646                 aer = STAILQ_FIRST(&sc->aer_list);
647                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
648                 free(aer);
649         }
650
651         pci_nvme_aer_init(sc);
652 }
653
654 static bool
655 pci_nvme_aer_available(struct pci_nvme_softc *sc)
656 {
657
658         return (!STAILQ_EMPTY(&sc->aer_list));
659 }
660
661 static bool
662 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
663 {
664         struct nvme_controller_data *cd = &sc->ctrldata;
665
666         /* AERL is a zero-based value while aer_count is one-based */
667         return (sc->aer_count == (cd->aerl + 1));
668 }
669
670 /*
671  * Add an Async Event Request
672  *
673  * Stores an AER to be returned later if the Controller needs to notify the
674  * host of an event.
675  * Note that while the NVMe spec doesn't require Controllers to return AER's
676  * in order, this implementation does preserve the order.
677  */
678 static int
679 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
680 {
681         struct pci_nvme_aer *aer = NULL;
682
683         if (pci_nvme_aer_limit_reached(sc))
684                 return (-1);
685
686         aer = calloc(1, sizeof(struct pci_nvme_aer));
687         if (aer == NULL)
688                 return (-1);
689
690         sc->aer_count++;
691
692         /* Save the Command ID for use in the completion message */
693         aer->cid = cid;
694         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
695
696         return (0);
697 }
698
699 /*
700  * Get an Async Event Request structure
701  *
702  * Returns a pointer to an AER previously submitted by the host or NULL if
703  * no AER's exist. Caller is responsible for freeing the returned struct.
704  */
705 static struct pci_nvme_aer *
706 pci_nvme_aer_get(struct pci_nvme_softc *sc)
707 {
708         struct pci_nvme_aer *aer = NULL;
709
710         aer = STAILQ_FIRST(&sc->aer_list);
711         if (aer != NULL) {
712                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
713                 sc->aer_count--;
714         }
715         
716         return (aer);
717 }
718
719 static void
720 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
721 {
722         uint32_t i;
723
724         DPRINTF("%s", __func__);
725
726         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
727             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
728             (60 << NVME_CAP_LO_REG_TO_SHIFT);
729
730         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
731
732         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
733
734         sc->regs.cc = 0;
735         sc->regs.csts = 0;
736
737         assert(sc->submit_queues != NULL);
738
739         for (i = 0; i < sc->num_squeues + 1; i++) {
740                 sc->submit_queues[i].qbase = NULL;
741                 sc->submit_queues[i].size = 0;
742                 sc->submit_queues[i].cqid = 0;
743                 sc->submit_queues[i].tail = 0;
744                 sc->submit_queues[i].head = 0;
745         }
746
747         assert(sc->compl_queues != NULL);
748
749         for (i = 0; i < sc->num_cqueues + 1; i++) {
750                 sc->compl_queues[i].qbase = NULL;
751                 sc->compl_queues[i].size = 0;
752                 sc->compl_queues[i].tail = 0;
753                 sc->compl_queues[i].head = 0;
754         }
755
756         sc->num_q_is_set = false;
757
758         pci_nvme_aer_destroy(sc);
759 }
760
761 static void
762 pci_nvme_reset(struct pci_nvme_softc *sc)
763 {
764         pthread_mutex_lock(&sc->mtx);
765         pci_nvme_reset_locked(sc);
766         pthread_mutex_unlock(&sc->mtx);
767 }
768
769 static void
770 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
771 {
772         uint16_t acqs, asqs;
773
774         DPRINTF("%s", __func__);
775
776         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
777         sc->submit_queues[0].size = asqs;
778         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
779                     sizeof(struct nvme_command) * asqs);
780
781         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
782                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
783
784         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
785             NVME_AQA_REG_ACQS_MASK) + 1;
786         sc->compl_queues[0].size = acqs;
787         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
788                  sizeof(struct nvme_completion) * acqs);
789         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
790
791         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
792                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
793 }
794
795 static int
796 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
797         size_t len, enum nvme_copy_dir dir)
798 {
799         uint8_t *p;
800         size_t bytes;
801
802         if (len > (8 * 1024)) {
803                 return (-1);
804         }
805
806         /* Copy from the start of prp1 to the end of the physical page */
807         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
808         bytes = MIN(bytes, len);
809
810         p = vm_map_gpa(ctx, prp1, bytes);
811         if (p == NULL) {
812                 return (-1);
813         }
814
815         if (dir == NVME_COPY_TO_PRP)
816                 memcpy(p, b, bytes);
817         else
818                 memcpy(b, p, bytes);
819
820         b += bytes;
821
822         len -= bytes;
823         if (len == 0) {
824                 return (0);
825         }
826
827         len = MIN(len, PAGE_SIZE);
828
829         p = vm_map_gpa(ctx, prp2, len);
830         if (p == NULL) {
831                 return (-1);
832         }
833
834         if (dir == NVME_COPY_TO_PRP)
835                 memcpy(p, b, len);
836         else
837                 memcpy(b, p, len);
838
839         return (0);
840 }
841
842 /*
843  * Write a Completion Queue Entry update
844  *
845  * Write the completion and update the doorbell value
846  */
847 static void
848 pci_nvme_cq_update(struct pci_nvme_softc *sc,
849                 struct nvme_completion_queue *cq,
850                 uint32_t cdw0,
851                 uint16_t cid,
852                 uint16_t sqid,
853                 uint16_t status)
854 {
855         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
856         struct nvme_completion *cqe;
857
858         assert(cq->qbase != NULL);
859
860         pthread_mutex_lock(&cq->mtx);
861
862         cqe = &cq->qbase[cq->tail];
863
864         /* Flip the phase bit */
865         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
866
867         cqe->cdw0 = cdw0;
868         cqe->sqhd = sq->head;
869         cqe->sqid = sqid;
870         cqe->cid = cid;
871         cqe->status = status;
872
873         cq->tail++;
874         if (cq->tail >= cq->size) {
875                 cq->tail = 0;
876         }
877
878         pthread_mutex_unlock(&cq->mtx);
879 }
880
881 static int
882 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
883         struct nvme_completion* compl)
884 {
885         uint16_t qid = command->cdw10 & 0xffff;
886
887         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
888         if (qid == 0 || qid > sc->num_squeues ||
889             (sc->submit_queues[qid].qbase == NULL)) {
890                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
891                         __func__, qid, sc->num_squeues);
892                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
893                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
894                 return (1);
895         }
896
897         sc->submit_queues[qid].qbase = NULL;
898         sc->submit_queues[qid].cqid = 0;
899         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
900         return (1);
901 }
902
903 static int
904 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
905         struct nvme_completion* compl)
906 {
907         if (command->cdw11 & NVME_CMD_CDW11_PC) {
908                 uint16_t qid = command->cdw10 & 0xffff;
909                 struct nvme_submission_queue *nsq;
910
911                 if ((qid == 0) || (qid > sc->num_squeues) ||
912                     (sc->submit_queues[qid].qbase != NULL)) {
913                         WPRINTF("%s queue index %u > num_squeues %u",
914                                 __func__, qid, sc->num_squeues);
915                         pci_nvme_status_tc(&compl->status,
916                             NVME_SCT_COMMAND_SPECIFIC,
917                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
918                         return (1);
919                 }
920
921                 nsq = &sc->submit_queues[qid];
922                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
923                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
924                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
925                         /*
926                          * Queues must specify at least two entries
927                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
928                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
929                          */
930                         pci_nvme_status_tc(&compl->status,
931                             NVME_SCT_COMMAND_SPECIFIC,
932                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
933                         return (1);
934                 }
935
936                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
937                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
938                         pci_nvme_status_tc(&compl->status,
939                             NVME_SCT_COMMAND_SPECIFIC,
940                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
941                         return (1);
942                 }
943
944                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
945                         pci_nvme_status_tc(&compl->status,
946                             NVME_SCT_COMMAND_SPECIFIC,
947                             NVME_SC_COMPLETION_QUEUE_INVALID);
948                         return (1);
949                 }
950
951                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
952
953                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
954                               sizeof(struct nvme_command) * (size_t)nsq->size);
955
956                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
957                         qid, nsq->size, nsq->qbase, nsq->cqid);
958
959                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
960
961                 DPRINTF("%s completed creating IOSQ qid %u",
962                          __func__, qid);
963         } else {
964                 /*
965                  * Guest sent a non-contiguous submission queue request.
966                  * This setting is unsupported by this emulation.
967                  */
968                 WPRINTF("%s unsupported non-contig (list-based) "
969                          "create i/o submission queue", __func__);
970
971                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
972         }
973         return (1);
974 }
975
976 static int
977 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
978         struct nvme_completion* compl)
979 {
980         uint16_t qid = command->cdw10 & 0xffff;
981         uint16_t sqid;
982
983         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
984         if (qid == 0 || qid > sc->num_cqueues ||
985             (sc->compl_queues[qid].qbase == NULL)) {
986                 WPRINTF("%s queue index %u / num_cqueues %u",
987                         __func__, qid, sc->num_cqueues);
988                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
989                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
990                 return (1);
991         }
992
993         /* Deleting an Active CQ is an error */
994         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
995                 if (sc->submit_queues[sqid].cqid == qid) {
996                         pci_nvme_status_tc(&compl->status,
997                             NVME_SCT_COMMAND_SPECIFIC,
998                             NVME_SC_INVALID_QUEUE_DELETION);
999                         return (1);
1000                 }
1001
1002         sc->compl_queues[qid].qbase = NULL;
1003         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1004         return (1);
1005 }
1006
1007 static int
1008 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1009         struct nvme_completion* compl)
1010 {
1011         struct nvme_completion_queue *ncq;
1012         uint16_t qid = command->cdw10 & 0xffff;
1013
1014         /* Only support Physically Contiguous queues */
1015         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1016                 WPRINTF("%s unsupported non-contig (list-based) "
1017                          "create i/o completion queue",
1018                          __func__);
1019
1020                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1021                 return (1);
1022         }
1023
1024         if ((qid == 0) || (qid > sc->num_cqueues) ||
1025             (sc->compl_queues[qid].qbase != NULL)) {
1026                 WPRINTF("%s queue index %u > num_cqueues %u",
1027                         __func__, qid, sc->num_cqueues);
1028                 pci_nvme_status_tc(&compl->status,
1029                     NVME_SCT_COMMAND_SPECIFIC,
1030                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1031                 return (1);
1032         }
1033
1034         ncq = &sc->compl_queues[qid];
1035         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1036         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1037         if (ncq->intr_vec > (sc->max_queues + 1)) {
1038                 pci_nvme_status_tc(&compl->status,
1039                     NVME_SCT_COMMAND_SPECIFIC,
1040                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1041                 return (1);
1042         }
1043
1044         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1045         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1046                 /*
1047                  * Queues must specify at least two entries
1048                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1049                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1050                  */
1051                 pci_nvme_status_tc(&compl->status,
1052                     NVME_SCT_COMMAND_SPECIFIC,
1053                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1054                 return (1);
1055         }
1056         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1057                      command->prp1,
1058                      sizeof(struct nvme_command) * (size_t)ncq->size);
1059
1060         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1061
1062
1063         return (1);
1064 }
1065
1066 static int
1067 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1068         struct nvme_completion* compl)
1069 {
1070         uint32_t logsize = 0;
1071         uint8_t logpage = command->cdw10 & 0xFF;
1072
1073         DPRINTF("%s log page %u", __func__, logpage);
1074
1075         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1076
1077         /*
1078          * Command specifies the number of dwords to return in fields NUMDU
1079          * and NUMDL. This is a zero-based value.
1080          */
1081         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1082         logsize *= sizeof(uint32_t);
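        /* e.g., a NUMD of 0x7F (zero-based) selects 128 dwords = 512 bytes */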
1083
1084         switch (logpage) {
1085         case NVME_LOG_ERROR:
1086                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1087                     command->prp2, (uint8_t *)&sc->err_log,
1088                     MIN(logsize, sizeof(sc->err_log)),
1089                     NVME_COPY_TO_PRP);
1090                 break;
1091         case NVME_LOG_HEALTH_INFORMATION:
1092                 pthread_mutex_lock(&sc->mtx);
1093                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1094                     sizeof(sc->health_log.data_units_read));
1095                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1096                     sizeof(sc->health_log.data_units_written));
1097                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1098                     sizeof(sc->health_log.host_read_commands));
1099                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1100                     sizeof(sc->health_log.host_write_commands));
1101                 pthread_mutex_unlock(&sc->mtx);
1102
1103                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1104                     command->prp2, (uint8_t *)&sc->health_log,
1105                     MIN(logsize, sizeof(sc->health_log)),
1106                     NVME_COPY_TO_PRP);
1107                 break;
1108         case NVME_LOG_FIRMWARE_SLOT:
1109                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1110                     command->prp2, (uint8_t *)&sc->fw_log,
1111                     MIN(logsize, sizeof(sc->fw_log)),
1112                     NVME_COPY_TO_PRP);
1113                 break;
1114         default:
1115                 DPRINTF("%s get log page %x command not supported",
1116                         __func__, logpage);
1117
1118                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1119                     NVME_SC_INVALID_LOG_PAGE);
1120         }
1121
1122         return (1);
1123 }
1124
1125 static int
1126 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1127         struct nvme_completion* compl)
1128 {
1129         void *dest;
1130         uint16_t status;
1131
1132         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1133                 command->cdw10 & 0xFF, command->nsid);
1134
1135         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1136
1137         switch (command->cdw10 & 0xFF) {
1138         case 0x00: /* return Identify Namespace data structure */
1139                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1140                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1141                     NVME_COPY_TO_PRP);
1142                 break;
1143         case 0x01: /* return Identify Controller data structure */
1144                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1145                     command->prp2, (uint8_t *)&sc->ctrldata,
1146                     sizeof(sc->ctrldata),
1147                     NVME_COPY_TO_PRP);
1148                 break;
1149         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1150                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1151                                   sizeof(uint32_t) * 1024);
1152                 /* All unused entries shall be zero */
1153                 bzero(dest, sizeof(uint32_t) * 1024);
1154                 ((uint32_t *)dest)[0] = 1;
1155                 break;
1156         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1157                 if (command->nsid != 1) {
1158                         pci_nvme_status_genc(&status,
1159                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1160                         break;
1161                 }
1162                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1163                                   sizeof(uint32_t) * 1024);
1164                 /* All bytes after the descriptor shall be zero */
1165                 bzero(dest, sizeof(uint32_t) * 1024);
1166
1167                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1168                 ((uint8_t *)dest)[0] = 1;
1169                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1170                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1171                 break;
1172         default:
1173                 DPRINTF("%s unsupported identify command requested 0x%x",
1174                          __func__, command->cdw10 & 0xFF);
1175                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1176                 break;
1177         }
1178
1179         compl->status = status;
1180         return (1);
1181 }
1182
1183 static const char *
1184 nvme_fid_to_name(uint8_t fid)
1185 {
1186         const char *name;
1187
1188         switch (fid) {
1189         case NVME_FEAT_ARBITRATION:
1190                 name = "Arbitration";
1191                 break;
1192         case NVME_FEAT_POWER_MANAGEMENT:
1193                 name = "Power Management";
1194                 break;
1195         case NVME_FEAT_LBA_RANGE_TYPE:
1196                 name = "LBA Range Type";
1197                 break;
1198         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1199                 name = "Temperature Threshold";
1200                 break;
1201         case NVME_FEAT_ERROR_RECOVERY:
1202                 name = "Error Recovery";
1203                 break;
1204         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1205                 name = "Volatile Write Cache";
1206                 break;
1207         case NVME_FEAT_NUMBER_OF_QUEUES:
1208                 name = "Number of Queues";
1209                 break;
1210         case NVME_FEAT_INTERRUPT_COALESCING:
1211                 name = "Interrupt Coalescing";
1212                 break;
1213         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1214                 name = "Interrupt Vector Configuration";
1215                 break;
1216         case NVME_FEAT_WRITE_ATOMICITY:
1217                 name = "Write Atomicity Normal";
1218                 break;
1219         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1220                 name = "Asynchronous Event Configuration";
1221                 break;
1222         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1223                 name = "Autonomous Power State Transition";
1224                 break;
1225         case NVME_FEAT_HOST_MEMORY_BUFFER:
1226                 name = "Host Memory Buffer";
1227                 break;
1228         case NVME_FEAT_TIMESTAMP:
1229                 name = "Timestamp";
1230                 break;
1231         case NVME_FEAT_KEEP_ALIVE_TIMER:
1232                 name = "Keep Alive Timer";
1233                 break;
1234         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1235                 name = "Host Controlled Thermal Management";
1236                 break;
1237         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1238                 name = "Non-Operational Power State Config";
1239                 break;
1240         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1241                 name = "Read Recovery Level Config";
1242                 break;
1243         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1244                 name = "Predictable Latency Mode Config";
1245                 break;
1246         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1247                 name = "Predictable Latency Mode Window";
1248                 break;
1249         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1250                 name = "LBA Status Information Report Interval";
1251                 break;
1252         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1253                 name = "Host Behavior Support";
1254                 break;
1255         case NVME_FEAT_SANITIZE_CONFIG:
1256                 name = "Sanitize Config";
1257                 break;
1258         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1259                 name = "Endurance Group Event Configuration";
1260                 break;
1261         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1262                 name = "Software Progress Marker";
1263                 break;
1264         case NVME_FEAT_HOST_IDENTIFIER:
1265                 name = "Host Identifier";
1266                 break;
1267         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1268                 name = "Reservation Notification Mask";
1269                 break;
1270         case NVME_FEAT_RESERVATION_PERSISTENCE:
1271                 name = "Reservation Persistence";
1272                 break;
1273         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1274                 name = "Namespace Write Protection Config";
1275                 break;
1276         default:
1277                 name = "Unknown";
1278                 break;
1279         }
1280
1281         return (name);
1282 }
1283
1284 static void
1285 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1286     struct nvme_feature_obj *feat,
1287     struct nvme_command *command,
1288     struct nvme_completion *compl)
1289 {
1290
1291         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1292 }
1293
1294 static void
1295 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1296     struct nvme_feature_obj *feat,
1297     struct nvme_command *command,
1298     struct nvme_completion *compl)
1299 {
1300         uint32_t i;
1301         uint32_t cdw11 = command->cdw11;
1302         uint16_t iv;
1303         bool cd;
1304
1305         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1306
1307         iv = cdw11 & 0xffff;
1308         cd = cdw11 & (1 << 16);
1309
1310         if (iv > (sc->max_queues + 1)) {
1311                 return;
1312         }
1313
1314         /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1315         if ((iv == 0) && !cd)
1316                 return;
1317
1318         /* Requested Interrupt Vector must be used by a CQ */
1319         for (i = 0; i < sc->num_cqueues + 1; i++) {
1320                 if (sc->compl_queues[i].intr_vec == iv) {
1321                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1322                 }
1323         }
1324
1325 }
1326
1327 static void
1328 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1329     struct nvme_feature_obj *feat,
1330     struct nvme_command *command,
1331     struct nvme_completion *compl)
1332 {
1333         uint16_t nqr;   /* Number of Queues Requested */
1334
1335         if (sc->num_q_is_set) {
1336                 WPRINTF("%s: Number of Queues already set", __func__);
1337                 pci_nvme_status_genc(&compl->status,
1338                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1339                 return;
1340         }
1341
1342         nqr = command->cdw11 & 0xFFFF;
1343         if (nqr == 0xffff) {
1344                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1345                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1346                 return;
1347         }
1348
1349         sc->num_squeues = ONE_BASED(nqr);
1350         if (sc->num_squeues > sc->max_queues) {
1351                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1352                                         sc->max_queues);
1353                 sc->num_squeues = sc->max_queues;
1354         }
1355
1356         nqr = (command->cdw11 >> 16) & 0xFFFF;
1357         if (nqr == 0xffff) {
1358                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1359                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1360                 return;
1361         }
1362
1363         sc->num_cqueues = ONE_BASED(nqr);
1364         if (sc->num_cqueues > sc->max_queues) {
1365                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1366                                         sc->max_queues);
1367                 sc->num_cqueues = sc->max_queues;
1368         }
1369
1370         /* Patch the command value which will be saved on callback's return */
1371         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1372         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1373
1374         sc->num_q_is_set = true;
1375 }
1376
1377 static int
1378 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1379         struct nvme_completion *compl)
1380 {
1381         struct nvme_feature_obj *feat;
1382         uint32_t nsid = command->nsid;
1383         uint8_t fid = command->cdw10 & 0xFF;
1384
1385         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1386
1387         if (fid >= NVME_FID_MAX) {
1388                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1389                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1390                 return (1);
1391         }
1392         feat = &sc->feat[fid];
1393
1394         if (!feat->namespace_specific &&
1395             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1396                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1397                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1398                 return (1);
1399         }
1400
1401         compl->cdw0 = 0;
1402         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1403
1404         if (feat->set)
1405                 feat->set(sc, feat, command, compl);
1406
1407         if (compl->status == NVME_SC_SUCCESS)
1408                 feat->cdw11 = command->cdw11;
1409
1410         return (0);
1411 }
1412
1413 static int
1414 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1415         struct nvme_completion* compl)
1416 {
1417         struct nvme_feature_obj *feat;
1418         uint8_t fid = command->cdw10 & 0xFF;
1419
1420         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1421
1422         if (fid >= NVME_FID_MAX) {
1423                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1424                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1425                 return (1);
1426         }
1427
1428         compl->cdw0 = 0;
1429         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1430
1431         feat = &sc->feat[fid];
1432         if (feat->get) {
1433                 feat->get(sc, feat, command, compl);
1434         }
1435
1436         if (compl->status == NVME_SC_SUCCESS) {
1437                 compl->cdw0 = feat->cdw11;
1438         }
1439
1440         return (0);
1441 }
1442
1443 static int
1444 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1445         struct nvme_completion* compl)
1446 {
1447         uint8_t ses, lbaf, pi;
1448
1449         /* Only supports Secure Erase Setting - User Data Erase */
1450         ses = (command->cdw10 >> 9) & 0x7;
1451         if (ses > 0x1) {
1452                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1453                 return (1);
1454         }
1455
1456         /* Only supports a single LBA Format */
1457         lbaf = command->cdw10 & 0xf;
1458         if (lbaf != 0) {
1459                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1460                     NVME_SC_INVALID_FORMAT);
1461                 return (1);
1462         }
1463
1464         /* Doesn't support Protection Information */
1465         pi = (command->cdw10 >> 5) & 0x7;
1466         if (pi != 0) {
1467                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1468                 return (1);
1469         }
1470
1471         if (sc->nvstore.type == NVME_STOR_RAM) {
1472                 if (sc->nvstore.ctx)
1473                         free(sc->nvstore.ctx);
1474                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1475                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1476         } else {
1477                 struct pci_nvme_ioreq *req;
1478                 int err;
1479
1480                 req = pci_nvme_get_ioreq(sc);
1481                 if (req == NULL) {
1482                         pci_nvme_status_genc(&compl->status,
1483                             NVME_SC_INTERNAL_DEVICE_ERROR);
1484                         WPRINTF("%s: unable to allocate IO req", __func__);
1485                         return (1);
1486                 }
1487                 req->nvme_sq = &sc->submit_queues[0];
1488                 req->sqid = 0;
1489                 req->opc = command->opc;
1490                 req->cid = command->cid;
1491                 req->nsid = command->nsid;
1492
1493                 req->io_req.br_offset = 0;
1494                 req->io_req.br_resid = sc->nvstore.size;
1495                 req->io_req.br_callback = pci_nvme_io_done;
1496
1497                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1498                 if (err) {
1499                         pci_nvme_status_genc(&compl->status,
1500                             NVME_SC_INTERNAL_DEVICE_ERROR);
1501                         pci_nvme_release_ioreq(sc, req);
1502                 }
1503         }
1504
1505         return (1);
1506 }
1507
1508 static int
1509 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1510         struct nvme_completion* compl)
1511 {
1512         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1513                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1514
1515         /* TODO: search for the command ID and abort it */
1516
1517         compl->cdw0 = 1;
1518         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1519         return (1);
1520 }
1521
1522 static int
1523 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1524         struct nvme_command* command, struct nvme_completion* compl)
1525 {
1526         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1527
1528         /* Don't exceed the Async Event Request Limit (AERL). */
1529         if (pci_nvme_aer_limit_reached(sc)) {
1530                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1531                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1532                 return (1);
1533         }
1534
1535         if (pci_nvme_aer_add(sc, command->cid)) {
1536                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1537                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1538                 return (1);
1539         }
1540
1541         /*
1542          * Raise events when they happen based on the Set Features cmd.
1543          * These events happen asynchronously, so only post a successful
1544          * completion when an event matching this request actually occurs.
1545          */
1546         compl->status = NVME_NO_STATUS;
1547
1548         return (0);
1549 }
1550
1551 static void
1552 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1553 {
1554         struct nvme_completion compl;
1555         struct nvme_command *cmd;
1556         struct nvme_submission_queue *sq;
1557         struct nvme_completion_queue *cq;
1558         uint16_t sqhead;
1559
1560         DPRINTF("%s index %u", __func__, (uint32_t)value);
1561
1562         sq = &sc->submit_queues[0];
1563         cq = &sc->compl_queues[0];
1564
1565         pthread_mutex_lock(&sq->mtx);
1566
1567         sqhead = sq->head;
1568         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1569
1570         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1571                 cmd = &(sq->qbase)[sqhead];
1572                 compl.cdw0 = 0;
1573                 compl.status = 0;
1574
1575                 switch (cmd->opc) {
1576                 case NVME_OPC_DELETE_IO_SQ:
1577                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1578                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1579                         break;
1580                 case NVME_OPC_CREATE_IO_SQ:
1581                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1582                         nvme_opc_create_io_sq(sc, cmd, &compl);
1583                         break;
1584                 case NVME_OPC_DELETE_IO_CQ:
1585                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1586                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1587                         break;
1588                 case NVME_OPC_CREATE_IO_CQ:
1589                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1590                         nvme_opc_create_io_cq(sc, cmd, &compl);
1591                         break;
1592                 case NVME_OPC_GET_LOG_PAGE:
1593                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1594                         nvme_opc_get_log_page(sc, cmd, &compl);
1595                         break;
1596                 case NVME_OPC_IDENTIFY:
1597                         DPRINTF("%s command IDENTIFY", __func__);
1598                         nvme_opc_identify(sc, cmd, &compl);
1599                         break;
1600                 case NVME_OPC_ABORT:
1601                         DPRINTF("%s command ABORT", __func__);
1602                         nvme_opc_abort(sc, cmd, &compl);
1603                         break;
1604                 case NVME_OPC_SET_FEATURES:
1605                         DPRINTF("%s command SET_FEATURES", __func__);
1606                         nvme_opc_set_features(sc, cmd, &compl);
1607                         break;
1608                 case NVME_OPC_GET_FEATURES:
1609                         DPRINTF("%s command GET_FEATURES", __func__);
1610                         nvme_opc_get_features(sc, cmd, &compl);
1611                         break;
1612                 case NVME_OPC_FIRMWARE_ACTIVATE:
1613                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1614                         pci_nvme_status_tc(&compl.status,
1615                             NVME_SCT_COMMAND_SPECIFIC,
1616                             NVME_SC_INVALID_FIRMWARE_SLOT);
1617                         break;
1618                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1619                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1620                         nvme_opc_async_event_req(sc, cmd, &compl);
1621                         break;
1622                 case NVME_OPC_FORMAT_NVM:
1623                         DPRINTF("%s command FORMAT_NVM", __func__);
1624                         if ((sc->ctrldata.oacs &
1625                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1626                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1627                         }
1628                         compl.status = NVME_NO_STATUS;
1629                         nvme_opc_format_nvm(sc, cmd, &compl);
1630                         break;
1631                 default:
1632                         DPRINTF("0x%x command is not implemented",
1633                             cmd->opc);
1634                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1635                 }
1636                 sqhead = (sqhead + 1) % sq->size;
1637
1638                 if (NVME_COMPLETION_VALID(compl)) {
1639                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1640                             compl.cdw0,
1641                             cmd->cid,
1642                             0,          /* SQID */
1643                             compl.status);
1644                 }
1645         }
1646
1647         DPRINTF("setting sqhead %u", sqhead);
1648         sq->head = sqhead;
1649
1650         if (cq->head != cq->tail)
1651                 pci_generate_msix(sc->nsc_pi, 0);
1652
1653         pthread_mutex_unlock(&sq->mtx);
1654 }
1655
1656 /*
1657  * Update the Write and Read statistics reported in SMART data
1658  *
1659  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1660  * E.g. 1 data unit is 1 - 1,000 512 byte blocks and 3 data units are 2,001 - 3,000
1661  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1662  */
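     /*
      * Example: a 256 KiB write adds 512 to the remainder; together with the
      * initial bias of 999 this crosses 1,000, so one data unit is counted
      * and 511 is carried forward to the next update.
      */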
1663 static void
1664 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1665     size_t bytes, uint16_t status)
1666 {
1667
1668         pthread_mutex_lock(&sc->mtx);
1669         switch (opc) {
1670         case NVME_OPC_WRITE:
1671                 sc->write_commands++;
1672                 if (status != NVME_SC_SUCCESS)
1673                         break;
1674                 sc->write_dunits_remainder += (bytes / 512);
1675                 while (sc->write_dunits_remainder >= 1000) {
1676                         sc->write_data_units++;
1677                         sc->write_dunits_remainder -= 1000;
1678                 }
1679                 break;
1680         case NVME_OPC_READ:
1681                 sc->read_commands++;
1682                 if (status != NVME_SC_SUCCESS)
1683                         break;
1684                 sc->read_dunits_remainder += (bytes / 512);
1685                 while (sc->read_dunits_remainder >= 1000) {
1686                         sc->read_data_units++;
1687                         sc->read_dunits_remainder -= 1000;
1688                 }
1689                 break;
1690         default:
1691                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1692                 break;
1693         }
1694         pthread_mutex_unlock(&sc->mtx);
1695 }
1696
1697 /*
1698  * Check if the combination of Starting LBA (slba) and Number of Logical
1699  * Blocks (nlb) exceeds the range of the underlying storage.
1700  *
1701  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1702  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1703  * overflow.
1704  */
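     /*
      * For example, with 512 byte sectors (sectsz_bits = 9), any slba at or
      * above 2^55 would overflow the byte offset calculation and is caught
      * by the shift check below.
      */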
1705 static bool
1706 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1707     uint32_t nlb)
1708 {
1709         size_t  offset, bytes;
1710
1711         /* Overflow check of multiplying Starting LBA by the sector size */
1712         if (slba >> (64 - nvstore->sectsz_bits))
1713                 return (true);
1714
1715         offset = slba << nvstore->sectsz_bits;
1716         bytes = nlb << nvstore->sectsz_bits;
1717
1718         /* Overflow check of Number of Logical Blocks */
1719         if ((nvstore->size - offset) < bytes)
1720                 return (true);
1721
1722         return (false);
1723 }
1724
1725 static int
1726 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1727         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1728 {
1729         int iovidx;
1730
1731         if (req == NULL)
1732                 return (-1);
1733
1734         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1735                 return (-1);
1736         }
1737
1738         /* concatenate contig block-iovs to minimize number of iovs */
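             /*
              * prev_gpaddr and prev_size describe the most recently appended
              * segment; a new page starting where that segment ends simply
              * extends the last iovec rather than consuming a new one.
              */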
1739         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1740                 iovidx = req->io_req.br_iovcnt - 1;
1741
1742                 req->io_req.br_iov[iovidx].iov_base =
1743                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1744                                      req->prev_gpaddr, size);
1745
1746                 req->prev_size += size;
1747                 req->io_req.br_resid += size;
1748
1749                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1750         } else {
1751                 iovidx = req->io_req.br_iovcnt;
1752                 if (iovidx == 0) {
1753                         req->io_req.br_offset = lba;
1754                         req->io_req.br_resid = 0;
1755                         req->io_req.br_param = req;
1756                 }
1757
1758                 req->io_req.br_iov[iovidx].iov_base =
1759                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1760                                      gpaddr, size);
1761
1762                 req->io_req.br_iov[iovidx].iov_len = size;
1763
1764                 req->prev_gpaddr = gpaddr;
1765                 req->prev_size = size;
1766                 req->io_req.br_resid += size;
1767
1768                 req->io_req.br_iovcnt++;
1769         }
1770
1771         return (0);
1772 }
1773
1774 static void
1775 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1776         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1777         uint32_t cdw0, uint16_t status)
1778 {
1779         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1780
1781         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1782                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1783                  NVME_STATUS_GET_SC(status));
1784
1785         pci_nvme_cq_update(sc, cq,
1786             0,          /* CDW0 */
1787             cid,
1788             sqid,
1789             status);
1790
1791         if (cq->head != cq->tail) {
1792                 if (cq->intr_en & NVME_CQ_INTEN) {
1793                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1794                 } else {
1795                         DPRINTF("%s: CQ%u interrupt disabled",
1796                                                 __func__, sq->cqid);
1797                 }
1798         }
1799 }
1800
1801 static void
1802 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1803 {
1804         req->sc = NULL;
1805         req->nvme_sq = NULL;
1806         req->sqid = 0;
1807
1808         pthread_mutex_lock(&sc->mtx);
1809
1810         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1811         sc->pending_ios--;
1812
1813         /* once no IO is pending, set ready if the device is enabled but not yet ready */
1814         if (sc->pending_ios == 0 &&
1815             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1816                 sc->regs.csts |= NVME_CSTS_RDY;
1817
1818         pthread_mutex_unlock(&sc->mtx);
1819
1820         sem_post(&sc->iosemlock);
1821 }
1822
1823 static struct pci_nvme_ioreq *
1824 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1825 {
1826         struct pci_nvme_ioreq *req = NULL;
1827
1828         sem_wait(&sc->iosemlock);
1829         pthread_mutex_lock(&sc->mtx);
1830
1831         req = STAILQ_FIRST(&sc->ioreqs_free);
1832         assert(req != NULL);
1833         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1834
1835         req->sc = sc;
1836
1837         sc->pending_ios++;
1838
1839         pthread_mutex_unlock(&sc->mtx);
1840
1841         req->io_req.br_iovcnt = 0;
1842         req->io_req.br_offset = 0;
1843         req->io_req.br_resid = 0;
1844         req->io_req.br_param = req;
1845         req->prev_gpaddr = 0;
1846         req->prev_size = 0;
1847
1848         return (req);
1849 }
1850
1851 static void
1852 pci_nvme_io_done(struct blockif_req *br, int err)
1853 {
1854         struct pci_nvme_ioreq *req = br->br_param;
1855         struct nvme_submission_queue *sq = req->nvme_sq;
1856         uint16_t code, status;
1857
1858         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1859
1860         /* TODO return correct error */
1861         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1862         pci_nvme_status_genc(&status, code);
1863
1864         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1865         pci_nvme_stats_write_read_update(req->sc, req->opc,
1866             req->bytes, status);
1867         pci_nvme_release_ioreq(req->sc, req);
1868 }
1869
1870 /*
1871  * Implements the Flush command. The specification states:
1872  *    If a volatile write cache is not present, Flush commands complete
1873  *    successfully and have no effect
1874  * in the description of the Volatile Write Cache (VWC) field of the Identify
1875  * Controller data. Therefore, set status to Success if the command is
1876  * not supported (i.e. RAM or as indicated by the blockif).
1877  */
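     /*
      * A blockif_flush() result of EOPNOTSUPP means the backing store cannot
      * flush, so it is reported as Success per the spec text quoted above.
      */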
1878 static bool
1879 nvme_opc_flush(struct pci_nvme_softc *sc,
1880     struct nvme_command *cmd,
1881     struct pci_nvme_blockstore *nvstore,
1882     struct pci_nvme_ioreq *req,
1883     uint16_t *status)
1884 {
1885         bool pending = false;
1886
1887         if (nvstore->type == NVME_STOR_RAM) {
1888                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1889         } else {
1890                 int err;
1891
1892                 req->io_req.br_callback = pci_nvme_io_done;
1893
1894                 err = blockif_flush(nvstore->ctx, &req->io_req);
1895                 switch (err) {
1896                 case 0:
1897                         pending = true;
1898                         break;
1899                 case EOPNOTSUPP:
1900                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1901                         break;
1902                 default:
1903                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1904                 }
1905         }
1906
1907         return (pending);
1908 }
1909
1910 static uint16_t
1911 nvme_write_read_ram(struct pci_nvme_softc *sc,
1912     struct pci_nvme_blockstore *nvstore,
1913     uint64_t prp1, uint64_t prp2,
1914     size_t offset, uint64_t bytes,
1915     bool is_write)
1916 {
1917         uint8_t *buf = nvstore->ctx;
1918         enum nvme_copy_dir dir;
1919         uint16_t status;
1920
1921         if (is_write)
1922                 dir = NVME_COPY_TO_PRP;
1923         else
1924                 dir = NVME_COPY_FROM_PRP;
1925
1926         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1927             buf + offset, bytes, dir))
1928                 pci_nvme_status_genc(&status,
1929                     NVME_SC_DATA_TRANSFER_ERROR);
1930         else
1931                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1932
1933         return (status);
1934 }
1935
1936 static uint16_t
1937 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1938     struct pci_nvme_blockstore *nvstore,
1939     struct pci_nvme_ioreq *req,
1940     uint64_t prp1, uint64_t prp2,
1941     size_t offset, uint64_t bytes,
1942     bool is_write)
1943 {
1944         uint64_t size;
1945         int err;
1946         uint16_t status = NVME_NO_STATUS;
1947
1948         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1949         if (pci_nvme_append_iov_req(sc, req, prp1,
1950             size, is_write, offset)) {
1951                 pci_nvme_status_genc(&status,
1952                     NVME_SC_DATA_TRANSFER_ERROR);
1953                 goto out;
1954         }
1955
1956         offset += size;
1957         bytes  -= size;
1958
1959         if (bytes == 0) {
1960                 ;
1961         } else if (bytes <= PAGE_SIZE) {
1962                 size = bytes;
1963                 if (pci_nvme_append_iov_req(sc, req, prp2,
1964                     size, is_write, offset)) {
1965                         pci_nvme_status_genc(&status,
1966                             NVME_SC_DATA_TRANSFER_ERROR);
1967                         goto out;
1968                 }
1969         } else {
1970                 void *vmctx = sc->nsc_pi->pi_vmctx;
1971                 uint64_t *prp_list = &prp2;
1972                 uint64_t *last = prp_list;
1973
1974                 /* PRP2 is pointer to a physical region page list */
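                     /*
                      * Each list entry maps at most one guest page; the code
                      * treats the final slot of a list page as the address of
                      * the next list page, which is why prp_list is re-mapped
                      * once it reaches 'last'.
                      */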
1975                 while (bytes) {
1976                         /* Last entry in list points to the next list */
1977                         if (prp_list == last) {
1978                                 uint64_t prp = *prp_list;
1979
1980                                 prp_list = paddr_guest2host(vmctx, prp,
1981                                     PAGE_SIZE - (prp % PAGE_SIZE));
1982                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
1983                         }
1984
1985                         size = MIN(bytes, PAGE_SIZE);
1986
1987                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
1988                             size, is_write, offset)) {
1989                                 pci_nvme_status_genc(&status,
1990                                     NVME_SC_DATA_TRANSFER_ERROR);
1991                                 goto out;
1992                         }
1993
1994                         offset += size;
1995                         bytes  -= size;
1996
1997                         prp_list++;
1998                 }
1999         }
2000         req->io_req.br_callback = pci_nvme_io_done;
2001         if (is_write)
2002                 err = blockif_write(nvstore->ctx, &req->io_req);
2003         else
2004                 err = blockif_read(nvstore->ctx, &req->io_req);
2005
2006         if (err)
2007                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2008 out:
2009         return (status);
2010 }
2011
2012 static bool
2013 nvme_opc_write_read(struct pci_nvme_softc *sc,
2014     struct nvme_command *cmd,
2015     struct pci_nvme_blockstore *nvstore,
2016     struct pci_nvme_ioreq *req,
2017     uint16_t *status)
2018 {
2019         uint64_t lba, nblocks, bytes;
2020         size_t offset;
2021         bool is_write = cmd->opc == NVME_OPC_WRITE;
2022         bool pending = false;
2023
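             /* The SLBA spans CDW11:CDW10; the NLB field in CDW12 is zero-based */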
2024         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2025         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2026         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2027                 WPRINTF("%s command would exceed LBA range", __func__);
2028                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2029                 goto out;
2030         }
2031
2032         bytes  = nblocks << nvstore->sectsz_bits;
2033         if (bytes > NVME_MAX_DATA_SIZE) {
2034                 WPRINTF("%s command would exceed MDTS", __func__);
2035                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2036                 goto out;
2037         }
2038
2039         offset = lba << nvstore->sectsz_bits;
2040
2041         req->bytes = bytes;
2042         req->io_req.br_offset = lba;
2043
2044         /* PRP bits 1:0 must be zero */
2045         cmd->prp1 &= ~0x3UL;
2046         cmd->prp2 &= ~0x3UL;
2047
2048         if (nvstore->type == NVME_STOR_RAM) {
2049                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2050                     cmd->prp2, offset, bytes, is_write);
2051         } else {
2052                 *status = nvme_write_read_blockif(sc, nvstore, req,
2053                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2054
2055                 if (*status == NVME_NO_STATUS)
2056                         pending = true;
2057         }
2058 out:
2059         if (!pending)
2060                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2061
2062         return (pending);
2063 }
2064
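     /*
      * Completion callback for multi-range deallocate (DSM) requests: step
      * through the ranges stashed in br_iov, issuing one blockif_delete()
      * per entry, and post the NVMe completion once every range finishes
      * or an error occurs.
      */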
2065 static void
2066 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2067 {
2068         struct pci_nvme_ioreq *req = br->br_param;
2069         struct pci_nvme_softc *sc = req->sc;
2070         bool done = true;
2071         uint16_t status;
2072
2073         if (err) {
2074                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2075         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2076                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2077         } else {
2078                 struct iovec *iov = req->io_req.br_iov;
2079
2080                 req->prev_gpaddr++;
2081                 iov += req->prev_gpaddr;
2082
2083                 /* The iov_* values are already in bytes (sector size applied) */
2084                 req->io_req.br_offset = (off_t)iov->iov_base;
2085                 req->io_req.br_resid = iov->iov_len;
2086                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2087                         pci_nvme_status_genc(&status,
2088                             NVME_SC_INTERNAL_DEVICE_ERROR);
2089                 } else
2090                         done = false;
2091         }
2092
2093         if (done) {
2094                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2095                     req->cid, 0, status);
2096                 pci_nvme_release_ioreq(sc, req);
2097         }
2098 }
2099
2100 static bool
2101 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2102     struct nvme_command *cmd,
2103     struct pci_nvme_blockstore *nvstore,
2104     struct pci_nvme_ioreq *req,
2105     uint16_t *status)
2106 {
2107         struct nvme_dsm_range *range;
2108         uint32_t nr, r, non_zero, dr;
2109         int err;
2110         bool pending = false;
2111
2112         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2113                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2114                 goto out;
2115         }
2116
2117         nr = cmd->cdw10 & 0xff;
2118
2119         /* copy locally because a range entry could straddle PRPs */
2120         range = calloc(1, NVME_MAX_DSM_TRIM);
2121         if (range == NULL) {
2122                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2123                 goto out;
2124         }
2125         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2126             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2127
2128         /* Check for invalid ranges and the number of non-zero lengths */
2129         non_zero = 0;
2130         for (r = 0; r <= nr; r++) {
2131                 if (pci_nvme_out_of_range(nvstore,
2132                     range[r].starting_lba, range[r].length)) {
2133                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2134                         goto out;
2135                 }
2136                 if (range[r].length != 0)
2137                         non_zero++;
2138         }
2139
2140         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2141                 size_t offset, bytes;
2142                 int sectsz_bits = sc->nvstore.sectsz_bits;
2143
2144                 /*
2145                  * DSM calls are advisory only, and compliant controllers
2146                  * may choose to take no action (i.e. return Success).
2147                  */
2148                 if (!nvstore->deallocate) {
2149                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2150                         goto out;
2151                 }
2152
2153                 /* If all ranges have a zero length, return Success */
2154                 if (non_zero == 0) {
2155                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2156                         goto out;
2157                 }
2158
2159                 if (req == NULL) {
2160                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2161                         goto out;
2162                 }
2163
2164                 offset = range[0].starting_lba << sectsz_bits;
2165                 bytes = range[0].length << sectsz_bits;
2166
2167                 /*
2168                  * If the request is for more than a single range, store
2169                  * the ranges in the br_iov. Optimize for the common case
2170                  * of a single range.
2171                  *
2172                  * Note that NVMe Number of Ranges is a zero based value
2173                  */
2174                 req->io_req.br_iovcnt = 0;
2175                 req->io_req.br_offset = offset;
2176                 req->io_req.br_resid = bytes;
2177
2178                 if (nr == 0) {
2179                         req->io_req.br_callback = pci_nvme_io_done;
2180                 } else {
2181                         struct iovec *iov = req->io_req.br_iov;
2182
2183                         for (r = 0, dr = 0; r <= nr; r++) {
2184                                 offset = range[r].starting_lba << sectsz_bits;
2185                                 bytes = range[r].length << sectsz_bits;
2186                                 if (bytes == 0)
2187                                         continue;
2188
2189                                 if ((nvstore->size - offset) < bytes) {
2190                                         pci_nvme_status_genc(status,
2191                                             NVME_SC_LBA_OUT_OF_RANGE);
2192                                         goto out;
2193                                 }
2194                                 iov[dr].iov_base = (void *)offset;
2195                                 iov[dr].iov_len = bytes;
2196                                 dr++;
2197                         }
2198                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2199
2200                         /*
2201                          * Use prev_gpaddr to track the current entry and
2202                          * prev_size to track the number of entries
2203                          */
2204                         req->prev_gpaddr = 0;
2205                         req->prev_size = dr;
2206                 }
2207
2208                 err = blockif_delete(nvstore->ctx, &req->io_req);
2209                 if (err)
2210                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2211                 else
2212                         pending = true;
2213         }
2214 out:
2215         free(range);
2216         return (pending);
2217 }
2218
2219 static void
2220 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2221 {
2222         struct nvme_submission_queue *sq;
2223         uint16_t status;
2224         uint16_t sqhead;
2225
2226         /* handle all submissions up to sq->tail index */
2227         sq = &sc->submit_queues[idx];
2228
2229         pthread_mutex_lock(&sq->mtx);
2230
2231         sqhead = sq->head;
2232         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2233                  idx, sqhead, sq->tail, sq->qbase);
2234
2235         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2236                 struct nvme_command *cmd;
2237                 struct pci_nvme_ioreq *req;
2238                 uint32_t nsid;
2239                 bool pending;
2240
2241                 pending = false;
2242                 req = NULL;
2243                 status = 0;
2244
2245                 cmd = &sq->qbase[sqhead];
2246                 sqhead = (sqhead + 1) % sq->size;
2247
2248                 nsid = le32toh(cmd->nsid);
2249                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2250                         pci_nvme_status_genc(&status,
2251                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2252                         status |=
2253                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2254                         goto complete;
2255                 }
2256
2257                 req = pci_nvme_get_ioreq(sc);
2258                 if (req == NULL) {
2259                         pci_nvme_status_genc(&status,
2260                             NVME_SC_INTERNAL_DEVICE_ERROR);
2261                         WPRINTF("%s: unable to allocate IO req", __func__);
2262                         goto complete;
2263                 }
2264                 req->nvme_sq = sq;
2265                 req->sqid = idx;
2266                 req->opc = cmd->opc;
2267                 req->cid = cmd->cid;
2268                 req->nsid = cmd->nsid;
2269
2270                 switch (cmd->opc) {
2271                 case NVME_OPC_FLUSH:
2272                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2273                             req, &status);
2274                         break;
2275                 case NVME_OPC_WRITE:
2276                 case NVME_OPC_READ:
2277                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2278                             req, &status);
2279                         break;
2280                 case NVME_OPC_WRITE_ZEROES:
2281                         /* TODO: write zeroes
2282                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2283                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2284                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2285                         break;
2286                 case NVME_OPC_DATASET_MANAGEMENT:
2287                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2288                             req, &status);
2289                         break;
2290                 default:
2291                         WPRINTF("%s unhandled io command 0x%x",
2292                             __func__, cmd->opc);
2293                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2294                 }
2295 complete:
2296                 if (!pending) {
2297                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2298                             status);
2299                         if (req != NULL)
2300                                 pci_nvme_release_ioreq(sc, req);
2301                 }
2302         }
2303
2304         sq->head = sqhead;
2305
2306         pthread_mutex_unlock(&sq->mtx);
2307 }
2308
2309 static void
2310 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2311         uint64_t idx, int is_sq, uint64_t value)
2312 {
2313         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2314                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2315
2316         if (is_sq) {
2317                 if (idx > sc->num_squeues) {
2318                         WPRINTF("%s queue index %lu overflow from "
2319                                  "guest (max %u)",
2320                                  __func__, idx, sc->num_squeues);
2321                         return;
2322                 }
2323
2324                 atomic_store_short(&sc->submit_queues[idx].tail,
2325                                    (uint16_t)value);
2326
2327                 if (idx == 0) {
2328                         pci_nvme_handle_admin_cmd(sc, value);
2329                 } else {
2330                         /* submission queue; handle new entries in SQ */
2331                         if (idx > sc->num_squeues) {
2332                                 WPRINTF("%s SQ index %lu overflow from "
2333                                          "guest (max %u)",
2334                                          __func__, idx, sc->num_squeues);
2335                                 return;
2336                         }
2337                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2338                 }
2339         } else {
2340                 if (idx > sc->num_cqueues) {
2341                         WPRINTF("%s queue index %lu overflow from "
2342                                  "guest (max %u)",
2343                                  __func__, idx, sc->num_cqueues);
2344                         return;
2345                 }
2346
2347                 atomic_store_short(&sc->compl_queues[idx].head,
2348                                 (uint16_t)value);
2349         }
2350 }
2351
2352 static void
2353 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2354 {
2355         const char *s = iswrite ? "WRITE" : "READ";
2356
2357         switch (offset) {
2358         case NVME_CR_CAP_LOW:
2359                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2360                 break;
2361         case NVME_CR_CAP_HI:
2362                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2363                 break;
2364         case NVME_CR_VS:
2365                 DPRINTF("%s %s NVME_CR_VS", func, s);
2366                 break;
2367         case NVME_CR_INTMS:
2368                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2369                 break;
2370         case NVME_CR_INTMC:
2371                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2372                 break;
2373         case NVME_CR_CC:
2374                 DPRINTF("%s %s NVME_CR_CC", func, s);
2375                 break;
2376         case NVME_CR_CSTS:
2377                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2378                 break;
2379         case NVME_CR_NSSR:
2380                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2381                 break;
2382         case NVME_CR_AQA:
2383                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2384                 break;
2385         case NVME_CR_ASQ_LOW:
2386                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2387                 break;
2388         case NVME_CR_ASQ_HI:
2389                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2390                 break;
2391         case NVME_CR_ACQ_LOW:
2392                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2393                 break;
2394         case NVME_CR_ACQ_HI:
2395                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2396                 break;
2397         default:
2398                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2399         }
2400
2401 }
2402
2403 static void
2404 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2405         uint64_t offset, int size, uint64_t value)
2406 {
2407         uint32_t ccreg;
2408
2409         if (offset >= NVME_DOORBELL_OFFSET) {
2410                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2411                 uint64_t idx = belloffset / 8; /* doorbell size = 2*int */
2412                 int is_sq = (belloffset % 8) < 4;
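                     /*
                      * Doorbells are laid out in 8-byte strides: the SQ tail
                      * doorbell occupies the first 4 bytes of each stride and
                      * the CQ head doorbell the second 4 bytes.
                      */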
2413
2414                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2415                         WPRINTF("guest attempted an overflow write offset "
2416                                  "0x%lx, val 0x%lx in %s",
2417                                  offset, value, __func__);
2418                         return;
2419                 }
2420
2421                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2422                 return;
2423         }
2424
2425         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2426                 offset, size, value);
2427
2428         if (size != 4) {
2429                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2430                          "val 0x%lx) to bar0 in %s",
2431                          size, offset, value, __func__);
2432                 /* TODO: shutdown device */
2433                 return;
2434         }
2435
2436         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2437
2438         pthread_mutex_lock(&sc->mtx);
2439
2440         switch (offset) {
2441         case NVME_CR_CAP_LOW:
2442         case NVME_CR_CAP_HI:
2443                 /* readonly */
2444                 break;
2445         case NVME_CR_VS:
2446                 /* readonly */
2447                 break;
2448         case NVME_CR_INTMS:
2449                 /* MSI-X, so ignore */
2450                 break;
2451         case NVME_CR_INTMC:
2452                 /* MSI-X, so ignore */
2453                 break;
2454         case NVME_CR_CC:
2455                 ccreg = (uint32_t)value;
2456
2457                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2458                          "iocqes %u",
2459                         __func__,
2460                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2461                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2462                          NVME_CC_GET_IOCQES(ccreg));
2463
2464                 if (NVME_CC_GET_SHN(ccreg)) {
2465                         /* perform shutdown - flush out data to backend */
2466                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2467                             NVME_CSTS_REG_SHST_SHIFT);
2468                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2469                             NVME_CSTS_REG_SHST_SHIFT;
2470                 }
2471                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2472                         if (NVME_CC_GET_EN(ccreg) == 0)
2473                                 /* transition 1->0 causes controller reset */
2474                                 pci_nvme_reset_locked(sc);
2475                         else
2476                                 pci_nvme_init_controller(ctx, sc);
2477                 }
2478
2479                 /* Insert the iocqes, iosqes and en bits from the write */
2480                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2481                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2482                 if (NVME_CC_GET_EN(ccreg) == 0) {
2483                         /* Insert the ams, mps and css bit fields */
2484                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2485                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2486                         sc->regs.csts &= ~NVME_CSTS_RDY;
2487                 } else if (sc->pending_ios == 0) {
2488                         sc->regs.csts |= NVME_CSTS_RDY;
2489                 }
2490                 break;
2491         case NVME_CR_CSTS:
2492                 break;
2493         case NVME_CR_NSSR:
2494                 /* ignore writes; don't support subsystem reset */
2495                 break;
2496         case NVME_CR_AQA:
2497                 sc->regs.aqa = (uint32_t)value;
2498                 break;
2499         case NVME_CR_ASQ_LOW:
2500                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2501                                (0xFFFFF000 & value);
2502                 break;
2503         case NVME_CR_ASQ_HI:
2504                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2505                                (value << 32);
2506                 break;
2507         case NVME_CR_ACQ_LOW:
2508                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2509                                (0xFFFFF000 & value);
2510                 break;
2511         case NVME_CR_ACQ_HI:
2512                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2513                                (value << 32);
2514                 break;
2515         default:
2516                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2517                          __func__, offset, value, size);
2518         }
2519         pthread_mutex_unlock(&sc->mtx);
2520 }
2521
2522 static void
2523 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2524                 int baridx, uint64_t offset, int size, uint64_t value)
2525 {
2526         struct pci_nvme_softc* sc = pi->pi_arg;
2527
2528         if (baridx == pci_msix_table_bar(pi) ||
2529             baridx == pci_msix_pba_bar(pi)) {
2530                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2531                          " value 0x%lx", baridx, offset, size, value);
2532
2533                 pci_emul_msix_twrite(pi, offset, size, value);
2534                 return;
2535         }
2536
2537         switch (baridx) {
2538         case 0:
2539                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2540                 break;
2541
2542         default:
2543                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2544                          __func__, baridx, value);
2545         }
2546 }
2547
2548 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2549         uint64_t offset, int size)
2550 {
2551         uint64_t value;
2552
2553         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2554
2555         if (offset < NVME_DOORBELL_OFFSET) {
2556                 void *p = &(sc->regs);
2557                 pthread_mutex_lock(&sc->mtx);
2558                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2559                 pthread_mutex_unlock(&sc->mtx);
2560         } else {
2561                 value = 0;
2562                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2563         }
2564
2565         switch (size) {
2566         case 1:
2567                 value &= 0xFF;
2568                 break;
2569         case 2:
2570                 value &= 0xFFFF;
2571                 break;
2572         case 4:
2573                 value &= 0xFFFFFFFF;
2574                 break;
2575         }
2576
2577         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2578                  offset, size, (uint32_t)value);
2579
2580         return (value);
2581 }
2582
2583
2584
2585 static uint64_t
2586 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2587     uint64_t offset, int size)
2588 {
2589         struct pci_nvme_softc* sc = pi->pi_arg;
2590
2591         if (baridx == pci_msix_table_bar(pi) ||
2592             baridx == pci_msix_pba_bar(pi)) {
2593                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2594                         baridx, offset, size);
2595
2596                 return pci_emul_msix_tread(pi, offset, size);
2597         }
2598
2599         switch (baridx) {
2600         case 0:
2601                 return pci_nvme_read_bar_0(sc, offset, size);
2602
2603         default:
2604                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2605         }
2606
2607         return (0);
2608 }
2609
2610
2611 static int
2612 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2613 {
2614         char bident[sizeof("XX:X:X")];
2615         char    *uopt, *xopts, *config;
2616         uint32_t sectsz;
2617         int optidx;
2618
2619         sc->max_queues = NVME_QUEUES;
2620         sc->max_qentries = NVME_MAX_QENTRIES;
2621         sc->ioslots = NVME_IOSLOTS;
2622         sc->num_squeues = sc->max_queues;
2623         sc->num_cqueues = sc->max_queues;
2624         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2625         sectsz = 0;
2626
2627         uopt = strdup(opts);
2628         optidx = 0;
2629         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2630                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2631         for (xopts = strtok(uopt, ",");
2632              xopts != NULL;
2633              xopts = strtok(NULL, ",")) {
2634
2635                 if ((config = strchr(xopts, '=')) != NULL)
2636                         *config++ = '\0';
2637
2638                 if (!strcmp("maxq", xopts)) {
2639                         sc->max_queues = atoi(config);
2640                 } else if (!strcmp("qsz", xopts)) {
2641                         sc->max_qentries = atoi(config);
2642                 } else if (!strcmp("ioslots", xopts)) {
2643                         sc->ioslots = atoi(config);
2644                 } else if (!strcmp("sectsz", xopts)) {
2645                         sectsz = atoi(config);
2646                 } else if (!strcmp("ser", xopts)) {
2647                         /*
2648                          * This field indicates the Product Serial Number in
2649                          * 7-bit ASCII; unused bytes should be space characters.
2650                          * Ref: NVMe v1.3c.
2651                          */
2652                         cpywithpad((char *)sc->ctrldata.sn,
2653                                    sizeof(sc->ctrldata.sn), config, ' ');
2654                 } else if (!strcmp("ram", xopts)) {
2655                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
2656
2657                         sc->nvstore.type = NVME_STOR_RAM;
2658                         sc->nvstore.size = sz * 1024 * 1024;
2659                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2660                         sc->nvstore.sectsz = 4096;
2661                         sc->nvstore.sectsz_bits = 12;
2662                         if (sc->nvstore.ctx == NULL) {
2663                                 perror("Unable to allocate RAM");
2664                                 free(uopt);
2665                                 return (-1);
2666                         }
2667                 } else if (!strcmp("eui64", xopts)) {
2668                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2669                 } else if (!strcmp("dsm", xopts)) {
2670                         if (!strcmp("auto", config))
2671                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2672                         else if (!strcmp("enable", config))
2673                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2674                         else if (!strcmp("disable", config))
2675                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2676                 } else if (optidx == 0) {
2677                         snprintf(bident, sizeof(bident), "%d:%d",
2678                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2679                         sc->nvstore.ctx = blockif_open(xopts, bident);
2680                         if (sc->nvstore.ctx == NULL) {
2681                                 perror("Could not open backing file");
2682                                 free(uopt);
2683                                 return (-1);
2684                         }
2685                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2686                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2687                 } else {
2688                         EPRINTLN("Invalid option %s", xopts);
2689                         free(uopt);
2690                         return (-1);
2691                 }
2692
2693                 optidx++;
2694         }
2695         free(uopt);
2696
2697         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2698                 EPRINTLN("backing store not specified");
2699                 return (-1);
2700         }
2701         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2702                 sc->nvstore.sectsz = sectsz;
2703         else if (sc->nvstore.type != NVME_STOR_RAM)
2704                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2705         for (sc->nvstore.sectsz_bits = 9;
2706              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2707              sc->nvstore.sectsz_bits++);
2708
2709         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2710                 sc->max_queues = NVME_QUEUES;
2711
2712         if (sc->max_qentries <= 0) {
2713                 EPRINTLN("Invalid qsz option");
2714                 return (-1);
2715         }
2716         if (sc->ioslots <= 0) {
2717                 EPRINTLN("Invalid ioslots option");
2718                 return (-1);
2719         }
2720
2721         return (0);
2722 }
2723
2724 static int
2725 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2726 {
2727         struct pci_nvme_softc *sc;
2728         uint32_t pci_membar_sz;
2729         int     error;
2730
2731         error = 0;
2732
2733         sc = calloc(1, sizeof(struct pci_nvme_softc));
2734         pi->pi_arg = sc;
2735         sc->nsc_pi = pi;
2736
2737         error = pci_nvme_parse_opts(sc, opts);
2738         if (error < 0)
2739                 goto done;
2740         else
2741                 error = 0;
2742
2743         STAILQ_INIT(&sc->ioreqs_free);
2744         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2745         for (int i = 0; i < sc->ioslots; i++) {
2746                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2747         }
2748
2749         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2750         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2751         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2752         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2753         pci_set_cfgdata8(pi, PCIR_PROGIF,
2754                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2755
2756         /*
2757          * Allocate size of NVMe registers + doorbell space for all queues.
2758          *
2759          * The specification requires a minimum memory I/O window size of 16K.
2760          * The Windows driver will refuse to start a device with a smaller
2761          * window.
2762          */
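             /*
              * Each queue pair, including the admin queue, contributes a
              * 4-byte SQ tail doorbell and a 4-byte CQ head doorbell, hence
              * the 2 * sizeof(uint32_t) * (max_queues + 1) term below.
              */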
2763         pci_membar_sz = sizeof(struct nvme_registers) +
2764             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2765         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2766
2767         DPRINTF("nvme membar size: %u", pci_membar_sz);
2768
2769         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2770         if (error) {
2771                 WPRINTF("%s pci alloc mem bar failed", __func__);
2772                 goto done;
2773         }
2774
2775         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2776         if (error) {
2777                 WPRINTF("%s pci add msixcap failed", __func__);
2778                 goto done;
2779         }
2780
2781         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2782         if (error) {
2783                 WPRINTF("%s pci add Express capability failed", __func__);
2784                 goto done;
2785         }
2786
2787         pthread_mutex_init(&sc->mtx, NULL);
2788         sem_init(&sc->iosemlock, 0, sc->ioslots);
2789
2790         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2791         /*
2792          * Controller data depends on Namespace data so initialize Namespace
2793          * data first.
2794          */
2795         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2796         pci_nvme_init_ctrldata(sc);
2797         pci_nvme_init_logpages(sc);
2798         pci_nvme_init_features(sc);
2799
2800         pci_nvme_aer_init(sc);
2801
2802         pci_nvme_reset(sc);
2803
2804         pci_lintr_request(pi);
2805
2806 done:
2807         return (error);
2808 }
2809
2810
2811 struct pci_devemu pci_de_nvme = {
2812         .pe_emu =       "nvme",
2813         .pe_init =      pci_nvme_init,
2814         .pe_barwrite =  pci_nvme_write,
2815         .pe_barread =   pci_nvme_read
2816 };
2817 PCI_EMUL_SET(pci_de_nvme);