[FreeBSD/FreeBSD.git] usr.sbin/bhyve/pci_nvme.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable, disable
51  *
52  */
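/*
 * Example invocation (hypothetical slot number, backing file, and tuning
 * values, shown only to illustrate the option syntax above):
 *
 *  -s 4,nvme,/path/to/disk.img,maxq=4,qsz=1024,ioslots=16,sectsz=512,ser=NVME0001,dsm=auto
 */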
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN             0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
106
107 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
108 #define NVME_MDTS               9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
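/*
 * Worked example with the defaults above: NVME_MDTS = 9 and NVME_MPSMIN = 0
 * (4 KiB pages) give NVME_MAX_IOVEC = (1 << 9) + 1 = 513 descriptors and
 * NVME_MAX_DATA_SIZE = (1 << 9) * 4096 bytes = 2 MiB per request.
 */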
112
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS          0xffff
115 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
116
117 /* helpers */
118
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)         ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)         ((one)  - 1)
123
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127          ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
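/* For example, num_squeues = 4 and num_cqueues = 4 encode as 0x00030003. */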
128
129 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
130
131 enum nvme_controller_register_offsets {
132         NVME_CR_CAP_LOW = 0x00,
133         NVME_CR_CAP_HI  = 0x04,
134         NVME_CR_VS      = 0x08,
135         NVME_CR_INTMS   = 0x0c,
136         NVME_CR_INTMC   = 0x10,
137         NVME_CR_CC      = 0x14,
138         NVME_CR_CSTS    = 0x1c,
139         NVME_CR_NSSR    = 0x20,
140         NVME_CR_AQA     = 0x24,
141         NVME_CR_ASQ_LOW = 0x28,
142         NVME_CR_ASQ_HI  = 0x2c,
143         NVME_CR_ACQ_LOW = 0x30,
144         NVME_CR_ACQ_HI  = 0x34,
145 };
146
147 enum nvme_cmd_cdw11 {
148         NVME_CMD_CDW11_PC  = 0x0001,
149         NVME_CMD_CDW11_IEN = 0x0002,
150         NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152
153 enum nvme_copy_dir {
154         NVME_COPY_TO_PRP,
155         NVME_COPY_FROM_PRP,
156 };
157
158 #define NVME_CQ_INTEN   0x01
159 #define NVME_CQ_INTCOAL 0x02
160
161 struct nvme_completion_queue {
162         struct nvme_completion *qbase;
163         pthread_mutex_t mtx;
164         uint32_t        size;
165         uint16_t        tail; /* nvme progress */
166         uint16_t        head; /* guest progress */
167         uint16_t        intr_vec;
168         uint32_t        intr_en;
169 };
170
171 struct nvme_submission_queue {
172         struct nvme_command *qbase;
173         pthread_mutex_t mtx;
174         uint32_t        size;
175         uint16_t        head; /* nvme progress */
176         uint16_t        tail; /* guest progress */
177         uint16_t        cqid; /* completion queue id */
178         int             qpriority;
179 };
180
181 enum nvme_storage_type {
182         NVME_STOR_BLOCKIF = 0,
183         NVME_STOR_RAM = 1,
184 };
185
186 struct pci_nvme_blockstore {
187         enum nvme_storage_type type;
188         void            *ctx;
189         uint64_t        size;
190         uint32_t        sectsz;
191         uint32_t        sectsz_bits;
192         uint64_t        eui64;
193         uint32_t        deallocate:1;
194 };
195
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
203 #define MDTS_PAD_SIZE \
204         NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205         NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206         0
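/*
 * For instance, NVME_MAX_IOVEC = 513 together with BLOCKIF_IOV_MAX = 128
 * (the value assumed here from block_if.h) yields 385 padding entries.
 */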
207
208 struct pci_nvme_ioreq {
209         struct pci_nvme_softc *sc;
210         STAILQ_ENTRY(pci_nvme_ioreq) link;
211         struct nvme_submission_queue *nvme_sq;
212         uint16_t        sqid;
213
214         /* command information */
215         uint16_t        opc;
216         uint16_t        cid;
217         uint32_t        nsid;
218
219         uint64_t        prev_gpaddr;
220         size_t          prev_size;
221         size_t          bytes;
222
223         struct blockif_req io_req;
224
225         struct iovec    iovpadding[MDTS_PAD_SIZE];
226 };
227
228 enum nvme_dsm_type {
229         /* Dataset Management bit in ONCS reflects backing storage capability */
230         NVME_DATASET_MANAGEMENT_AUTO,
231         /* Unconditionally set Dataset Management bit in ONCS */
232         NVME_DATASET_MANAGEMENT_ENABLE,
233         /* Unconditionally clear Dataset Management bit in ONCS */
234         NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244
245 struct nvme_feature_obj {
246         uint32_t        cdw11;
247         nvme_feature_cb set;
248         nvme_feature_cb get;
249         bool namespace_specific;
250 };
251
252 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253
254 struct pci_nvme_aer {
255         STAILQ_ENTRY(pci_nvme_aer) link;
256         uint16_t        cid;    /* Command ID of the submitted AER */
257 };
258
259 struct pci_nvme_softc {
260         struct pci_devinst *nsc_pi;
261
262         pthread_mutex_t mtx;
263
264         struct nvme_registers regs;
265
266         struct nvme_namespace_data  nsdata;
267         struct nvme_controller_data ctrldata;
268         struct nvme_error_information_entry err_log;
269         struct nvme_health_information_page health_log;
270         struct nvme_firmware_page fw_log;
271
272         struct pci_nvme_blockstore nvstore;
273
274         uint16_t        max_qentries;   /* max entries per queue */
275         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
276         uint32_t        num_cqueues;
277         uint32_t        num_squeues;
278         bool            num_q_is_set; /* Has host set Number of Queues */
279
280         struct pci_nvme_ioreq *ioreqs;
281         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
282         uint32_t        pending_ios;
283         uint32_t        ioslots;
284         sem_t           iosemlock;
285
286         /*
287          * Memory mapped Submission and Completion queues
288          * Each array includes both Admin and IO queues
289          */
290         struct nvme_completion_queue *compl_queues;
291         struct nvme_submission_queue *submit_queues;
292
293         struct nvme_feature_obj feat[NVME_FID_MAX];
294
295         enum nvme_dsm_type dataset_management;
296
297         /* Accounting for SMART data */
298         __uint128_t     read_data_units;
299         __uint128_t     write_data_units;
300         __uint128_t     read_commands;
301         __uint128_t     write_commands;
302         uint32_t        read_dunits_remainder;
303         uint32_t        write_dunits_remainder;
304
305         STAILQ_HEAD(, pci_nvme_aer) aer_list;
306         uint32_t        aer_count;
307 };
308
309
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
313
314 /* Controller Configuration utils */
315 #define NVME_CC_GET_EN(cc) \
316         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
317 #define NVME_CC_GET_CSS(cc) \
318         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
319 #define NVME_CC_GET_SHN(cc) \
320         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
321 #define NVME_CC_GET_IOSQES(cc) \
322         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
323 #define NVME_CC_GET_IOCQES(cc) \
324         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
325
326 #define NVME_CC_WRITE_MASK \
327         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
328          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
329          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
330
331 #define NVME_CC_NEN_WRITE_MASK \
332         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
333          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
334          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
335
336 /* Controller Status utils */
337 #define NVME_CSTS_GET_RDY(sts) \
338         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
339
340 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
341
342 /* Completion Queue status word utils */
343 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
344 #define NVME_STATUS_MASK \
345         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
346          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
347
348 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
349         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
350
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352     struct nvme_feature_obj *,
353     struct nvme_command *,
354     struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359 static void nvme_feature_iv_config(struct pci_nvme_softc *,
360     struct nvme_feature_obj *,
361     struct nvme_command *,
362     struct nvme_completion *);
363
364 static __inline void
365 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
366 {
367         size_t len;
368
369         len = strnlen(src, dst_size);
370         memset(dst, pad, dst_size);
371         memcpy(dst, src, len);
372 }
373
374 static __inline void
375 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
376 {
377
378         *status &= ~NVME_STATUS_MASK;
379         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
380                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
381 }
382
383 static __inline void
384 pci_nvme_status_genc(uint16_t *status, uint16_t code)
385 {
386
387         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
388 }
389
390 /*
391  * Initialize the requested number of IO Submission and Completion Queues.
392  * Admin queues are allocated implicitly.
393  */
394 static void
395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
396 {
397         uint32_t i;
398
399         /*
400          * Allocate and initialize the Submission Queues
401          */
402         if (nsq > NVME_QUEUES) {
403                 WPRINTF("%s: clamping number of SQ from %u to %u",
404                                         __func__, nsq, NVME_QUEUES);
405                 nsq = NVME_QUEUES;
406         }
407
408         sc->num_squeues = nsq;
409
410         sc->submit_queues = calloc(sc->num_squeues + 1,
411                                 sizeof(struct nvme_submission_queue));
412         if (sc->submit_queues == NULL) {
413                 WPRINTF("%s: SQ allocation failed", __func__);
414                 sc->num_squeues = 0;
415         } else {
416                 struct nvme_submission_queue *sq = sc->submit_queues;
417
418                 for (i = 0; i < sc->num_squeues; i++)
419                         pthread_mutex_init(&sq[i].mtx, NULL);
420         }
421
422         /*
423          * Allocate and initialize the Completion Queues
424          */
425         if (ncq > NVME_QUEUES) {
426                 WPRINTF("%s: clamping number of CQ from %u to %u",
427                                         __func__, ncq, NVME_QUEUES);
428                 ncq = NVME_QUEUES;
429         }
430
431         sc->num_cqueues = ncq;
432
433         sc->compl_queues = calloc(sc->num_cqueues + 1,
434                                 sizeof(struct nvme_completion_queue));
435         if (sc->compl_queues == NULL) {
436                 WPRINTF("%s: CQ allocation failed", __func__);
437                 sc->num_cqueues = 0;
438         } else {
439                 struct nvme_completion_queue *cq = sc->compl_queues;
440
441                 for (i = 0; i < sc->num_cqueues; i++)
442                         pthread_mutex_init(&cq[i].mtx, NULL);
443         }
444 }
445
446 static void
447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
448 {
449         struct nvme_controller_data *cd = &sc->ctrldata;
450
451         cd->vid = 0xFB5D;
452         cd->ssvid = 0x0000;
453
454         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
455         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
456
457         /* Num of submission commands that we can handle at a time (2^rab) */
458         cd->rab   = 4;
459
460         /* FreeBSD OUI */
461         cd->ieee[0] = 0x58;
462         cd->ieee[1] = 0x9c;
463         cd->ieee[2] = 0xfc;
464
465         cd->mic = 0;
466
467         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
468
469         cd->ver = 0x00010300;
470
471         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
472         cd->acl = 2;
473         cd->aerl = 4;
474
475         /* Advertise 1, Read-only firmware slot */
476         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
477             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
478         cd->lpa = 0;    /* TODO: support some simple things like SMART */
479         cd->elpe = 0;   /* max error log page entries */
480         cd->npss = 1;   /* number of power states supported */
481
482         /* Warning Composite Temperature Threshold */
483         cd->wctemp = 0x0157;
484
485         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
486             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
487         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
488             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
489         cd->nn = 1;     /* number of namespaces */
490
491         cd->oncs = 0;
492         switch (sc->dataset_management) {
493         case NVME_DATASET_MANAGEMENT_AUTO:
494                 if (sc->nvstore.deallocate)
495                         cd->oncs |= NVME_ONCS_DSM;
496                 break;
497         case NVME_DATASET_MANAGEMENT_ENABLE:
498                 cd->oncs |= NVME_ONCS_DSM;
499                 break;
500         default:
501                 break;
502         }
503
504         cd->fna = 0x03;
505
506         cd->power_state[0].mp = 10;
507 }
508
509 /*
510  * Calculate the CRC-16 of the given buffer
511  * See copyright attribution at top of file
512  */
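/*
 * This is the reflected CRC-16/ARC (polynomial 0x8005); as a sanity check,
 * crc16(0, "123456789", 9) is expected to yield 0xBB3D.
 */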
513 static uint16_t
514 crc16(uint16_t crc, const void *buffer, unsigned int len)
515 {
516         const unsigned char *cp = buffer;
517         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
518         static uint16_t const crc16_table[256] = {
519                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
520                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
521                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
522                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
523                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
524                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
525                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
526                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
527                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
528                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
529                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
530                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
531                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
532                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
533                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
534                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
535                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
536                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
537                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
538                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
539                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
540                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
541                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
542                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
543                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
544                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
545                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
546                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
547                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
548                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
549                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
550                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
551         };
552
553         while (len--)
554                 crc = (((crc >> 8) & 0xffU) ^
555                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
556         return crc;
557 }
558
559 static void
560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
561     struct nvme_namespace_data *nd, uint32_t nsid,
562     struct pci_nvme_blockstore *nvstore)
563 {
564
565         /* Get capacity and block size information from backing store */
566         nd->nsze = nvstore->size / nvstore->sectsz;
567         nd->ncap = nd->nsze;
568         nd->nuse = nd->nsze;
569
570         if (nvstore->type == NVME_STOR_BLOCKIF)
571                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
572
573         nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
574         nd->flbas = 0;
575
576         /* Create an EUI-64 if user did not provide one */
577         if (nvstore->eui64 == 0) {
578                 char *data = NULL;
579                 uint64_t eui64 = nvstore->eui64;
580
581                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
582                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
583
584                 if (data != NULL) {
585                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
586                         free(data);
587                 }
588                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
589         }
590         be64enc(nd->eui64, nvstore->eui64);
591
592         /* LBA data-sz = 2^lbads */
593         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
594 }
595
596 static void
597 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
598 {
599
600         memset(&sc->err_log, 0, sizeof(sc->err_log));
601         memset(&sc->health_log, 0, sizeof(sc->health_log));
602         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
603
604         /* Set read/write remainder to round up according to spec */
605         sc->read_dunits_remainder = 999;
606         sc->write_dunits_remainder = 999;
607
608         /* Set nominal Health values checked by implementations */
609         sc->health_log.temperature = 310;
610         sc->health_log.available_spare = 100;
611         sc->health_log.available_spare_threshold = 10;
612 }
613
614 static void
615 pci_nvme_init_features(struct pci_nvme_softc *sc)
616 {
617
618         sc->feat[0].set = nvme_feature_invalid_cb;
619         sc->feat[0].get = nvme_feature_invalid_cb;
620
621         sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
622         sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
623         sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
624         sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
625             nvme_feature_iv_config;
626         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
627             nvme_feature_invalid_cb;
628         sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
629             nvme_feature_invalid_cb;
630 }
631
632 static void
633 pci_nvme_aer_init(struct pci_nvme_softc *sc)
634 {
635
636         STAILQ_INIT(&sc->aer_list);
637         sc->aer_count = 0;
638 }
639
640 static void
641 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
642 {
643         struct pci_nvme_aer *aer = NULL;
644
645         while (!STAILQ_EMPTY(&sc->aer_list)) {
646                 aer = STAILQ_FIRST(&sc->aer_list);
647                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
648                 free(aer);
649         }
650
651         pci_nvme_aer_init(sc);
652 }
653
654 static bool
655 pci_nvme_aer_available(struct pci_nvme_softc *sc)
656 {
657
658         return (!STAILQ_EMPTY(&sc->aer_list));
659 }
660
661 static bool
662 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
663 {
664         struct nvme_controller_data *cd = &sc->ctrldata;
665
666         /* AERL is a zero-based value while aer_count is one-based */
667         return (sc->aer_count == (cd->aerl + 1));
668 }
669
670 /*
671  * Add an Async Event Request
672  *
673  * Stores an AER to be returned later if the Controller needs to notify the
674  * host of an event.
675  * Note that while the NVMe spec doesn't require Controllers to return AER's
676  * in order, this implementation does preserve the order.
677  */
678 static int
679 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
680 {
681         struct pci_nvme_aer *aer = NULL;
682
683         if (pci_nvme_aer_limit_reached(sc))
684                 return (-1);
685
686         aer = calloc(1, sizeof(struct pci_nvme_aer));
687         if (aer == NULL)
688                 return (-1);
689
690         sc->aer_count++;
691
692         /* Save the Command ID for use in the completion message */
693         aer->cid = cid;
694         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
695
696         return (0);
697 }
698
699 /*
700  * Get an Async Event Request structure
701  *
702  * Returns a pointer to an AER previously submitted by the host or NULL if
703  * no AER's exist. Caller is responsible for freeing the returned struct.
704  */
705 static struct pci_nvme_aer *
706 pci_nvme_aer_get(struct pci_nvme_softc *sc)
707 {
708         struct pci_nvme_aer *aer = NULL;
709
710         aer = STAILQ_FIRST(&sc->aer_list);
711         if (aer != NULL) {
712                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
713                 sc->aer_count--;
714         }
715         
716         return (aer);
717 }
718
719 static void
720 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
721 {
722         uint32_t i;
723
724         DPRINTF("%s", __func__);
725
726         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
727             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
728             (60 << NVME_CAP_LO_REG_TO_SHIFT);
729
730         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
731
732         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
733
734         sc->regs.cc = 0;
735         sc->regs.csts = 0;
736
737         assert(sc->submit_queues != NULL);
738
739         for (i = 0; i < sc->num_squeues + 1; i++) {
740                 sc->submit_queues[i].qbase = NULL;
741                 sc->submit_queues[i].size = 0;
742                 sc->submit_queues[i].cqid = 0;
743                 sc->submit_queues[i].tail = 0;
744                 sc->submit_queues[i].head = 0;
745         }
746
747         assert(sc->compl_queues != NULL);
748
749         for (i = 0; i < sc->num_cqueues + 1; i++) {
750                 sc->compl_queues[i].qbase = NULL;
751                 sc->compl_queues[i].size = 0;
752                 sc->compl_queues[i].tail = 0;
753                 sc->compl_queues[i].head = 0;
754         }
755
756         sc->num_q_is_set = false;
757
758         pci_nvme_aer_destroy(sc);
759 }
760
761 static void
762 pci_nvme_reset(struct pci_nvme_softc *sc)
763 {
764         pthread_mutex_lock(&sc->mtx);
765         pci_nvme_reset_locked(sc);
766         pthread_mutex_unlock(&sc->mtx);
767 }
768
769 static void
770 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
771 {
772         uint16_t acqs, asqs;
773
774         DPRINTF("%s", __func__);
775
776         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
777         sc->submit_queues[0].size = asqs;
778         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
779                     sizeof(struct nvme_command) * asqs);
780
781         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
782                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
783
784         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
785             NVME_AQA_REG_ACQS_MASK) + 1;
786         sc->compl_queues[0].size = acqs;
787         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
788                  sizeof(struct nvme_completion) * acqs);
789         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
790
791         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
792                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
793 }
794
795 static int
796 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
797         size_t len, enum nvme_copy_dir dir)
798 {
799         uint8_t *p;
800         size_t bytes;
801
802         if (len > (8 * 1024)) {
803                 return (-1);
804         }
805
806         /* Copy from the start of prp1 to the end of the physical page */
807         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
808         bytes = MIN(bytes, len);
809
810         p = vm_map_gpa(ctx, prp1, bytes);
811         if (p == NULL) {
812                 return (-1);
813         }
814
815         if (dir == NVME_COPY_TO_PRP)
816                 memcpy(p, b, bytes);
817         else
818                 memcpy(b, p, bytes);
819
820         b += bytes;
821
822         len -= bytes;
823         if (len == 0) {
824                 return (0);
825         }
826
827         len = MIN(len, PAGE_SIZE);
828
829         p = vm_map_gpa(ctx, prp2, len);
830         if (p == NULL) {
831                 return (-1);
832         }
833
834         if (dir == NVME_COPY_TO_PRP)
835                 memcpy(p, b, len);
836         else
837                 memcpy(b, p, len);
838
839         return (0);
840 }
841
842 /*
843  * Write a Completion Queue Entry update
844  *
845  * Write the completion and update the doorbell value
846  */
847 static void
848 pci_nvme_cq_update(struct pci_nvme_softc *sc,
849                 struct nvme_completion_queue *cq,
850                 uint32_t cdw0,
851                 uint16_t cid,
852                 uint16_t sqid,
853                 uint16_t status)
854 {
855         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
856         struct nvme_completion *cqe;
857
858         assert(cq->qbase != NULL);
859
860         pthread_mutex_lock(&cq->mtx);
861
862         cqe = &cq->qbase[cq->tail];
863
864         /* Flip the phase bit */
865         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
866
867         cqe->cdw0 = cdw0;
868         cqe->sqhd = sq->head;
869         cqe->sqid = sqid;
870         cqe->cid = cid;
871         cqe->status = status;
872
873         cq->tail++;
874         if (cq->tail >= cq->size) {
875                 cq->tail = 0;
876         }
877
878         pthread_mutex_unlock(&cq->mtx);
879 }
880
881 static int
882 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
883         struct nvme_completion* compl)
884 {
885         uint16_t qid = command->cdw10 & 0xffff;
886
887         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
888         if (qid == 0 || qid > sc->num_squeues ||
889             (sc->submit_queues[qid].qbase == NULL)) {
890                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
891                         __func__, qid, sc->num_squeues);
892                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
893                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
894                 return (1);
895         }
896
897         sc->submit_queues[qid].qbase = NULL;
898         sc->submit_queues[qid].cqid = 0;
899         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
900         return (1);
901 }
902
903 static int
904 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
905         struct nvme_completion* compl)
906 {
907         if (command->cdw11 & NVME_CMD_CDW11_PC) {
908                 uint16_t qid = command->cdw10 & 0xffff;
909                 struct nvme_submission_queue *nsq;
910
911                 if ((qid == 0) || (qid > sc->num_squeues) ||
912                     (sc->submit_queues[qid].qbase != NULL)) {
913                         WPRINTF("%s queue index %u > num_squeues %u",
914                                 __func__, qid, sc->num_squeues);
915                         pci_nvme_status_tc(&compl->status,
916                             NVME_SCT_COMMAND_SPECIFIC,
917                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
918                         return (1);
919                 }
920
921                 nsq = &sc->submit_queues[qid];
922                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
923                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
924                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
925                         /*
926                          * Queues must specify at least two entries
927                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
928                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
929                          */
930                         pci_nvme_status_tc(&compl->status,
931                             NVME_SCT_COMMAND_SPECIFIC,
932                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
933                         return (1);
934                 }
935                 nsq->head = nsq->tail = 0;
936
937                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
938                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
939                         pci_nvme_status_tc(&compl->status,
940                             NVME_SCT_COMMAND_SPECIFIC,
941                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
942                         return (1);
943                 }
944
945                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
946                         pci_nvme_status_tc(&compl->status,
947                             NVME_SCT_COMMAND_SPECIFIC,
948                             NVME_SC_COMPLETION_QUEUE_INVALID);
949                         return (1);
950                 }
951
952                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
953
954                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
955                               sizeof(struct nvme_command) * (size_t)nsq->size);
956
957                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
958                         qid, nsq->size, nsq->qbase, nsq->cqid);
959
960                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
961
962                 DPRINTF("%s completed creating IOSQ qid %u",
963                          __func__, qid);
964         } else {
965                 /*
966                  * Guest sent a non-contiguous submission queue request.
967                  * This setting is unsupported by this emulation.
968                  */
969                 WPRINTF("%s unsupported non-contig (list-based) "
970                          "create i/o submission queue", __func__);
971
972                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
973         }
974         return (1);
975 }
976
977 static int
978 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
979         struct nvme_completion* compl)
980 {
981         uint16_t qid = command->cdw10 & 0xffff;
982         uint16_t sqid;
983
984         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
985         if (qid == 0 || qid > sc->num_cqueues ||
986             (sc->compl_queues[qid].qbase == NULL)) {
987                 WPRINTF("%s queue index %u / num_cqueues %u",
988                         __func__, qid, sc->num_cqueues);
989                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
990                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
991                 return (1);
992         }
993
994         /* Deleting an Active CQ is an error */
995         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
996                 if (sc->submit_queues[sqid].cqid == qid) {
997                         pci_nvme_status_tc(&compl->status,
998                             NVME_SCT_COMMAND_SPECIFIC,
999                             NVME_SC_INVALID_QUEUE_DELETION);
1000                         return (1);
1001                 }
1002
1003         sc->compl_queues[qid].qbase = NULL;
1004         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1005         return (1);
1006 }
1007
1008 static int
1009 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1010         struct nvme_completion* compl)
1011 {
1012         struct nvme_completion_queue *ncq;
1013         uint16_t qid = command->cdw10 & 0xffff;
1014
1015         /* Only support Physically Contiguous queues */
1016         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1017                 WPRINTF("%s unsupported non-contig (list-based) "
1018                          "create i/o completion queue",
1019                          __func__);
1020
1021                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1022                 return (1);
1023         }
1024
1025         if ((qid == 0) || (qid > sc->num_cqueues) ||
1026             (sc->compl_queues[qid].qbase != NULL)) {
1027                 WPRINTF("%s queue index %u > num_cqueues %u",
1028                         __func__, qid, sc->num_cqueues);
1029                 pci_nvme_status_tc(&compl->status,
1030                     NVME_SCT_COMMAND_SPECIFIC,
1031                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1032                 return (1);
1033         }
1034
1035         ncq = &sc->compl_queues[qid];
1036         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1037         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1038         if (ncq->intr_vec > (sc->max_queues + 1)) {
1039                 pci_nvme_status_tc(&compl->status,
1040                     NVME_SCT_COMMAND_SPECIFIC,
1041                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1042                 return (1);
1043         }
1044
1045         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1046         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1047                 /*
1048                  * Queues must specify at least two entries
1049                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1050                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1051                  */
1052                 pci_nvme_status_tc(&compl->status,
1053                     NVME_SCT_COMMAND_SPECIFIC,
1054                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1055                 return (1);
1056         }
1057         ncq->head = ncq->tail = 0;
1058         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1059                      command->prp1,
1060                      sizeof(struct nvme_command) * (size_t)ncq->size);
1061
1062         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1063
1064
1065         return (1);
1066 }
1067
1068 static int
1069 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1070         struct nvme_completion* compl)
1071 {
1072         uint32_t logsize;
1073         uint8_t logpage = command->cdw10 & 0xFF;
1074
1075         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1076
1077         /*
1078          * Command specifies the number of dwords to return in fields NUMDU
1079          * and NUMDL. This is a zero-based value.
1080          */
1081         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1082         logsize *= sizeof(uint32_t);
1083
1084         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1085
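        /*
         * Worked example (values chosen for illustration): NUMDU = 0 and
         * NUMDL = 0x3FF request 1024 dwords, so logsize is 4096 bytes.
         */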
1086         switch (logpage) {
1087         case NVME_LOG_ERROR:
1088                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1089                     command->prp2, (uint8_t *)&sc->err_log,
1090                     MIN(logsize, sizeof(sc->err_log)),
1091                     NVME_COPY_TO_PRP);
1092                 break;
1093         case NVME_LOG_HEALTH_INFORMATION:
1094                 pthread_mutex_lock(&sc->mtx);
1095                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1096                     sizeof(sc->health_log.data_units_read));
1097                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1098                     sizeof(sc->health_log.data_units_written));
1099                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1100                     sizeof(sc->health_log.host_read_commands));
1101                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1102                     sizeof(sc->health_log.host_write_commands));
1103                 pthread_mutex_unlock(&sc->mtx);
1104
1105                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1106                     command->prp2, (uint8_t *)&sc->health_log,
1107                     MIN(logsize, sizeof(sc->health_log)),
1108                     NVME_COPY_TO_PRP);
1109                 break;
1110         case NVME_LOG_FIRMWARE_SLOT:
1111                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1112                     command->prp2, (uint8_t *)&sc->fw_log,
1113                     MIN(logsize, sizeof(sc->fw_log)),
1114                     NVME_COPY_TO_PRP);
1115                 break;
1116         default:
1117                 DPRINTF("%s get log page %x command not supported",
1118                         __func__, logpage);
1119
1120                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1121                     NVME_SC_INVALID_LOG_PAGE);
1122         }
1123
1124         return (1);
1125 }
1126
1127 static int
1128 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1129         struct nvme_completion* compl)
1130 {
1131         void *dest;
1132         uint16_t status;
1133
1134         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1135                 command->cdw10 & 0xFF, command->nsid);
1136
1137         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1138
1139         switch (command->cdw10 & 0xFF) {
1140         case 0x00: /* return Identify Namespace data structure */
1141                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1142                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1143                     NVME_COPY_TO_PRP);
1144                 break;
1145         case 0x01: /* return Identify Controller data structure */
1146                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1147                     command->prp2, (uint8_t *)&sc->ctrldata,
1148                     sizeof(sc->ctrldata),
1149                     NVME_COPY_TO_PRP);
1150                 break;
1151         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1152                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1153                                   sizeof(uint32_t) * 1024);
1154                 /* All unused entries shall be zero */
1155                 bzero(dest, sizeof(uint32_t) * 1024);
1156                 ((uint32_t *)dest)[0] = 1;
1157                 break;
1158         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1159                 if (command->nsid != 1) {
1160                         pci_nvme_status_genc(&status,
1161                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1162                         break;
1163                 }
1164                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1165                                   sizeof(uint32_t) * 1024);
1166                 /* All bytes after the descriptor shall be zero */
1167                 bzero(dest, sizeof(uint32_t) * 1024);
1168
1169                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1170                 ((uint8_t *)dest)[0] = 1;
1171                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1172                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1173                 break;
1174         default:
1175                 DPRINTF("%s unsupported identify command requested 0x%x",
1176                          __func__, command->cdw10 & 0xFF);
1177                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1178                 break;
1179         }
1180
1181         compl->status = status;
1182         return (1);
1183 }
1184
1185 static const char *
1186 nvme_fid_to_name(uint8_t fid)
1187 {
1188         const char *name;
1189
1190         switch (fid) {
1191         case NVME_FEAT_ARBITRATION:
1192                 name = "Arbitration";
1193                 break;
1194         case NVME_FEAT_POWER_MANAGEMENT:
1195                 name = "Power Management";
1196                 break;
1197         case NVME_FEAT_LBA_RANGE_TYPE:
1198                 name = "LBA Range Type";
1199                 break;
1200         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1201                 name = "Temperature Threshold";
1202                 break;
1203         case NVME_FEAT_ERROR_RECOVERY:
1204                 name = "Error Recovery";
1205                 break;
1206         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1207                 name = "Volatile Write Cache";
1208                 break;
1209         case NVME_FEAT_NUMBER_OF_QUEUES:
1210                 name = "Number of Queues";
1211                 break;
1212         case NVME_FEAT_INTERRUPT_COALESCING:
1213                 name = "Interrupt Coalescing";
1214                 break;
1215         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1216                 name = "Interrupt Vector Configuration";
1217                 break;
1218         case NVME_FEAT_WRITE_ATOMICITY:
1219                 name = "Write Atomicity Normal";
1220                 break;
1221         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1222                 name = "Asynchronous Event Configuration";
1223                 break;
1224         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1225                 name = "Autonomous Power State Transition";
1226                 break;
1227         case NVME_FEAT_HOST_MEMORY_BUFFER:
1228                 name = "Host Memory Buffer";
1229                 break;
1230         case NVME_FEAT_TIMESTAMP:
1231                 name = "Timestamp";
1232                 break;
1233         case NVME_FEAT_KEEP_ALIVE_TIMER:
1234                 name = "Keep Alive Timer";
1235                 break;
1236         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1237                 name = "Host Controlled Thermal Management";
1238                 break;
1239         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1240                 name = "Non-Operation Power State Config";
1241                 break;
1242         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1243                 name = "Read Recovery Level Config";
1244                 break;
1245         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1246                 name = "Predictable Latency Mode Config";
1247                 break;
1248         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1249                 name = "Predictable Latency Mode Window";
1250                 break;
1251         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1252                 name = "LBA Status Information Report Interval";
1253                 break;
1254         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1255                 name = "Host Behavior Support";
1256                 break;
1257         case NVME_FEAT_SANITIZE_CONFIG:
1258                 name = "Sanitize Config";
1259                 break;
1260         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1261                 name = "Endurance Group Event Configuration";
1262                 break;
1263         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1264                 name = "Software Progress Marker";
1265                 break;
1266         case NVME_FEAT_HOST_IDENTIFIER:
1267                 name = "Host Identifier";
1268                 break;
1269         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1270                 name = "Reservation Notification Mask";
1271                 break;
1272         case NVME_FEAT_RESERVATION_PERSISTENCE:
1273                 name = "Reservation Persistence";
1274                 break;
1275         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1276                 name = "Namespace Write Protection Config";
1277                 break;
1278         default:
1279                 name = "Unknown";
1280                 break;
1281         }
1282
1283         return (name);
1284 }
1285
1286 static void
1287 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1288     struct nvme_feature_obj *feat,
1289     struct nvme_command *command,
1290     struct nvme_completion *compl)
1291 {
1292
1293         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1294 }
1295
1296 static void
1297 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1298     struct nvme_feature_obj *feat,
1299     struct nvme_command *command,
1300     struct nvme_completion *compl)
1301 {
1302         uint32_t i;
1303         uint32_t cdw11 = command->cdw11;
1304         uint16_t iv;
1305         bool cd;
1306
1307         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1308
1309         iv = cdw11 & 0xffff;
1310         cd = cdw11 & (1 << 16);
1311
1312         if (iv > (sc->max_queues + 1)) {
1313                 return;
1314         }
1315
1316         /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1317         if ((iv == 0) && !cd)
1318                 return;
1319
1320         /* Requested Interrupt Vector must be used by a CQ */
1321         for (i = 0; i < sc->num_cqueues + 1; i++) {
1322                 if (sc->compl_queues[i].intr_vec == iv) {
1323                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1324                 }
1325         }
1326
1327 }
1328
1329 static void
1330 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1331     struct nvme_feature_obj *feat,
1332     struct nvme_command *command,
1333     struct nvme_completion *compl)
1334 {
1335         uint16_t nqr;   /* Number of Queues Requested */
1336
1337         if (sc->num_q_is_set) {
1338                 WPRINTF("%s: Number of Queues already set", __func__);
1339                 pci_nvme_status_genc(&compl->status,
1340                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1341                 return;
1342         }
1343
1344         nqr = command->cdw11 & 0xFFFF;
1345         if (nqr == 0xffff) {
1346                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1347                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1348                 return;
1349         }
1350
1351         sc->num_squeues = ONE_BASED(nqr);
1352         if (sc->num_squeues > sc->max_queues) {
1353                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1354                                         sc->max_queues);
1355                 sc->num_squeues = sc->max_queues;
1356         }
1357
1358         nqr = (command->cdw11 >> 16) & 0xFFFF;
1359         if (nqr == 0xffff) {
1360                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1361                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1362                 return;
1363         }
1364
1365         sc->num_cqueues = ONE_BASED(nqr);
1366         if (sc->num_cqueues > sc->max_queues) {
1367                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1368                                         sc->max_queues);
1369                 sc->num_cqueues = sc->max_queues;
1370         }
1371
1372         /* Patch the command value which will be saved on callback's return */
1373         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1374         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1375
1376         sc->num_q_is_set = true;
1377 }
1378
1379 static int
1380 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1381         struct nvme_completion *compl)
1382 {
1383         struct nvme_feature_obj *feat;
1384         uint32_t nsid = command->nsid;
1385         uint8_t fid = command->cdw10 & 0xFF;
1386
1387         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1388
1389         if (fid >= NVME_FID_MAX) {
1390                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1391                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1392                 return (1);
1393         }
1394         feat = &sc->feat[fid];
1395
1396         if (!feat->namespace_specific &&
1397             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1398                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1399                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1400                 return (1);
1401         }
1402
1403         compl->cdw0 = 0;
1404         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1405
1406         if (feat->set)
1407                 feat->set(sc, feat, command, compl);
1408
1409         if (compl->status == NVME_SC_SUCCESS)
1410                 feat->cdw11 = command->cdw11;
1411
1412         return (0);
1413 }
1414
1415 static int
1416 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1417         struct nvme_completion* compl)
1418 {
1419         struct nvme_feature_obj *feat;
1420         uint8_t fid = command->cdw10 & 0xFF;
1421
1422         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1423
1424         if (fid >= NVME_FID_MAX) {
1425                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1426                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1427                 return (1);
1428         }
1429
1430         compl->cdw0 = 0;
1431         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1432
1433         feat = &sc->feat[fid];
1434         if (feat->get) {
1435                 feat->get(sc, feat, command, compl);
1436         }
1437
1438         if (compl->status == NVME_SC_SUCCESS) {
1439                 compl->cdw0 = feat->cdw11;
1440         }
1441
1442         return (0);
1443 }
1444
1445 static int
1446 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1447         struct nvme_completion* compl)
1448 {
1449         uint8_t ses, lbaf, pi;
1450
1451         /* Only supports Secure Erase Setting - User Data Erase */
1452         ses = (command->cdw10 >> 9) & 0x7;
1453         if (ses > 0x1) {
1454                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1455                 return (1);
1456         }
1457
1458         /* Only supports a single LBA Format */
1459         lbaf = command->cdw10 & 0xf;
1460         if (lbaf != 0) {
1461                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1462                     NVME_SC_INVALID_FORMAT);
1463                 return (1);
1464         }
1465
1466         /* Doesn't support Protection Information */
1467         pi = (command->cdw10 >> 5) & 0x7;
1468         if (pi != 0) {
1469                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1470                 return (1);
1471         }
1472
1473         if (sc->nvstore.type == NVME_STOR_RAM) {
1474                 if (sc->nvstore.ctx)
1475                         free(sc->nvstore.ctx);
1476                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1477                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1478         } else {
1479                 struct pci_nvme_ioreq *req;
1480                 int err;
1481
1482                 req = pci_nvme_get_ioreq(sc);
1483                 if (req == NULL) {
1484                         pci_nvme_status_genc(&compl->status,
1485                             NVME_SC_INTERNAL_DEVICE_ERROR);
1486                         WPRINTF("%s: unable to allocate IO req", __func__);
1487                         return (1);
1488                 }
1489                 req->nvme_sq = &sc->submit_queues[0];
1490                 req->sqid = 0;
1491                 req->opc = command->opc;
1492                 req->cid = command->cid;
1493                 req->nsid = command->nsid;
1494
1495                 req->io_req.br_offset = 0;
1496                 req->io_req.br_resid = sc->nvstore.size;
1497                 req->io_req.br_callback = pci_nvme_io_done;
1498
1499                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1500                 if (err) {
1501                         pci_nvme_status_genc(&compl->status,
1502                             NVME_SC_INTERNAL_DEVICE_ERROR);
1503                         pci_nvme_release_ioreq(sc, req);
1504                 }
1505         }
1506
1507         return (1);
1508 }
1509
1510 static int
1511 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1512         struct nvme_completion* compl)
1513 {
1514         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1515                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1516
1517         /* TODO: search for the command ID and abort it */
1518
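             /* Completion Dword 0 bit 0 set to 1 indicates the command was not aborted */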
1519         compl->cdw0 = 1;
1520         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1521         return (1);
1522 }
1523
1524 static int
1525 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1526         struct nvme_command* command, struct nvme_completion* compl)
1527 {
1528         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1529
1530         /* Don't exceed the Async Event Request Limit (AERL). */
1531         if (pci_nvme_aer_limit_reached(sc)) {
1532                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1533                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1534                 return (1);
1535         }
1536
1537         if (pci_nvme_aer_add(sc, command->cid)) {
1538                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1539                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1540                 return (1);
1541         }
1542
1543         /*
1544          * Raise events when they happen based on the Set Features cmd.
1545          * These events happen asynchronously, so do not post a completion
1546          * now; a completion is posted only when a matching event occurs.
1547          */
1548         compl->status = NVME_NO_STATUS;
1549
1550         return (0);
1551 }
1552
1553 static void
1554 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1555 {
1556         struct nvme_completion compl;
1557         struct nvme_command *cmd;
1558         struct nvme_submission_queue *sq;
1559         struct nvme_completion_queue *cq;
1560         uint16_t sqhead;
1561
1562         DPRINTF("%s index %u", __func__, (uint32_t)value);
1563
1564         sq = &sc->submit_queues[0];
1565         cq = &sc->compl_queues[0];
1566
1567         pthread_mutex_lock(&sq->mtx);
1568
1569         sqhead = sq->head;
1570         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1571
1572         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1573                 cmd = &(sq->qbase)[sqhead];
1574                 compl.cdw0 = 0;
1575                 compl.status = 0;
1576
1577                 switch (cmd->opc) {
1578                 case NVME_OPC_DELETE_IO_SQ:
1579                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1580                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1581                         break;
1582                 case NVME_OPC_CREATE_IO_SQ:
1583                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1584                         nvme_opc_create_io_sq(sc, cmd, &compl);
1585                         break;
1586                 case NVME_OPC_DELETE_IO_CQ:
1587                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1588                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1589                         break;
1590                 case NVME_OPC_CREATE_IO_CQ:
1591                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1592                         nvme_opc_create_io_cq(sc, cmd, &compl);
1593                         break;
1594                 case NVME_OPC_GET_LOG_PAGE:
1595                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1596                         nvme_opc_get_log_page(sc, cmd, &compl);
1597                         break;
1598                 case NVME_OPC_IDENTIFY:
1599                         DPRINTF("%s command IDENTIFY", __func__);
1600                         nvme_opc_identify(sc, cmd, &compl);
1601                         break;
1602                 case NVME_OPC_ABORT:
1603                         DPRINTF("%s command ABORT", __func__);
1604                         nvme_opc_abort(sc, cmd, &compl);
1605                         break;
1606                 case NVME_OPC_SET_FEATURES:
1607                         DPRINTF("%s command SET_FEATURES", __func__);
1608                         nvme_opc_set_features(sc, cmd, &compl);
1609                         break;
1610                 case NVME_OPC_GET_FEATURES:
1611                         DPRINTF("%s command GET_FEATURES", __func__);
1612                         nvme_opc_get_features(sc, cmd, &compl);
1613                         break;
1614                 case NVME_OPC_FIRMWARE_ACTIVATE:
1615                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1616                         pci_nvme_status_tc(&compl.status,
1617                             NVME_SCT_COMMAND_SPECIFIC,
1618                             NVME_SC_INVALID_FIRMWARE_SLOT);
1619                         break;
1620                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1621                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1622                         nvme_opc_async_event_req(sc, cmd, &compl);
1623                         break;
1624                 case NVME_OPC_FORMAT_NVM:
1625                         DPRINTF("%s command FORMAT_NVM", __func__);
1626                         if ((sc->ctrldata.oacs &
1627                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1628                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                                     break;
1629                         }
1630                         compl.status = NVME_NO_STATUS;
1631                         nvme_opc_format_nvm(sc, cmd, &compl);
1632                         break;
1633                 default:
1634                         DPRINTF("0x%x command is not implemented",
1635                             cmd->opc);
1636                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1637                 }
1638                 sqhead = (sqhead + 1) % sq->size;
1639
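                     /* Post a completion unless the command completes asynchronously (NVME_NO_STATUS) */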
1640                 if (NVME_COMPLETION_VALID(compl)) {
1641                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1642                             compl.cdw0,
1643                             cmd->cid,
1644                             0,          /* SQID */
1645                             compl.status);
1646                 }
1647         }
1648
1649         DPRINTF("setting sqhead %u", sqhead);
1650         sq->head = sqhead;
1651
1652         if (cq->head != cq->tail)
1653                 pci_generate_msix(sc->nsc_pi, 0);
1654
1655         pthread_mutex_unlock(&sq->mtx);
1656 }
1657
1658 /*
1659  * Update the Write and Read statistics reported in SMART data
1660  *
1661  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1662  * E.g. 1 data unit is 1 - 1,000 512 byte blocks and 3 data units are 2,001 - 3,000
1663  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
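      * For example, with the remainder primed to 999, a single 4 KiB write
      * (8 blocks) pushes the remainder to 1,007 and counts as one data unit.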
1664  */
1665 static void
1666 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1667     size_t bytes, uint16_t status)
1668 {
1669
1670         pthread_mutex_lock(&sc->mtx);
1671         switch (opc) {
1672         case NVME_OPC_WRITE:
1673                 sc->write_commands++;
1674                 if (status != NVME_SC_SUCCESS)
1675                         break;
1676                 sc->write_dunits_remainder += (bytes / 512);
1677                 while (sc->write_dunits_remainder >= 1000) {
1678                         sc->write_data_units++;
1679                         sc->write_dunits_remainder -= 1000;
1680                 }
1681                 break;
1682         case NVME_OPC_READ:
1683                 sc->read_commands++;
1684                 if (status != NVME_SC_SUCCESS)
1685                         break;
1686                 sc->read_dunits_remainder += (bytes / 512);
1687                 while (sc->read_dunits_remainder >= 1000) {
1688                         sc->read_data_units++;
1689                         sc->read_dunits_remainder -= 1000;
1690                 }
1691                 break;
1692         default:
1693                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1694                 break;
1695         }
1696         pthread_mutex_unlock(&sc->mtx);
1697 }
1698
1699 /*
1700  * Check if the combination of Starting LBA (slba) and Number of Logical
1701  * Blocks (nlb) exceeds the range of the underlying storage.
1702  *
1703  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1704  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1705  * overflow.
1706  */
1707 static bool
1708 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1709     uint32_t nlb)
1710 {
1711         size_t  offset, bytes;
1712
1713         /* Overflow check of multiplying Starting LBA by the sector size */
1714         if (slba >> (64 - nvstore->sectsz_bits))
1715                 return (true);
1716
1717         offset = slba << nvstore->sectsz_bits;
1718         bytes = (size_t)nlb << nvstore->sectsz_bits;
1719
1720         /* Overflow check of Number of Logical Blocks */
1721         if ((nvstore->size - offset) < bytes)
1722                 return (true);
1723
1724         return (false);
1725 }
1726
1727 static int
1728 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1729         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1730 {
1731         int iovidx;
1732
1733         if (req == NULL)
1734                 return (-1);
1735
1736         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1737                 return (-1);
1738         }
1739
1740         /* concatenate contig block-iovs to minimize number of iovs */
1741         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1742                 iovidx = req->io_req.br_iovcnt - 1;
1743
1744                 req->io_req.br_iov[iovidx].iov_base =
1745                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1746                                      req->prev_gpaddr, size);
1747
1748                 req->prev_size += size;
1749                 req->io_req.br_resid += size;
1750
1751                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1752         } else {
1753                 iovidx = req->io_req.br_iovcnt;
1754                 if (iovidx == 0) {
1755                         req->io_req.br_offset = lba;
1756                         req->io_req.br_resid = 0;
1757                         req->io_req.br_param = req;
1758                 }
1759
1760                 req->io_req.br_iov[iovidx].iov_base =
1761                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1762                                      gpaddr, size);
1763
1764                 req->io_req.br_iov[iovidx].iov_len = size;
1765
1766                 req->prev_gpaddr = gpaddr;
1767                 req->prev_size = size;
1768                 req->io_req.br_resid += size;
1769
1770                 req->io_req.br_iovcnt++;
1771         }
1772
1773         return (0);
1774 }
1775
1776 static void
1777 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1778         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1779         uint32_t cdw0, uint16_t status)
1780 {
1781         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1782
1783         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1784                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1785                  NVME_STATUS_GET_SC(status));
1786
1787         pci_nvme_cq_update(sc, cq,
1788             0,          /* CDW0 */
1789             cid,
1790             sqid,
1791             status);
1792
1793         if (cq->head != cq->tail) {
1794                 if (cq->intr_en & NVME_CQ_INTEN) {
1795                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1796                 } else {
1797                         DPRINTF("%s: CQ%u interrupt disabled",
1798                                                 __func__, sq->cqid);
1799                 }
1800         }
1801 }
1802
1803 static void
1804 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1805 {
1806         req->sc = NULL;
1807         req->nvme_sq = NULL;
1808         req->sqid = 0;
1809
1810         pthread_mutex_lock(&sc->mtx);
1811
1812         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1813         sc->pending_ios--;
1814
1815         /* Once no more I/O is pending, set Ready if the device was reset/enabled */
1816         if (sc->pending_ios == 0 &&
1817             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1818                 sc->regs.csts |= NVME_CSTS_RDY;
1819
1820         pthread_mutex_unlock(&sc->mtx);
1821
1822         sem_post(&sc->iosemlock);
1823 }
1824
1825 static struct pci_nvme_ioreq *
1826 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1827 {
1828         struct pci_nvme_ioreq *req = NULL;
1829
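             /* iosemlock was initialized to ioslots, so this blocks while all I/O slots are in use */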
1830         sem_wait(&sc->iosemlock);
1831         pthread_mutex_lock(&sc->mtx);
1832
1833         req = STAILQ_FIRST(&sc->ioreqs_free);
1834         assert(req != NULL);
1835         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1836
1837         req->sc = sc;
1838
1839         sc->pending_ios++;
1840
1841         pthread_mutex_unlock(&sc->mtx);
1842
1843         req->io_req.br_iovcnt = 0;
1844         req->io_req.br_offset = 0;
1845         req->io_req.br_resid = 0;
1846         req->io_req.br_param = req;
1847         req->prev_gpaddr = 0;
1848         req->prev_size = 0;
1849
1850         return req;
1851 }
1852
1853 static void
1854 pci_nvme_io_done(struct blockif_req *br, int err)
1855 {
1856         struct pci_nvme_ioreq *req = br->br_param;
1857         struct nvme_submission_queue *sq = req->nvme_sq;
1858         uint16_t code, status;
1859
1860         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1861
1862         /* TODO return correct error */
1863         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1864         pci_nvme_status_genc(&status, code);
1865
1866         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1867         pci_nvme_stats_write_read_update(req->sc, req->opc,
1868             req->bytes, status);
1869         pci_nvme_release_ioreq(req->sc, req);
1870 }
1871
1872 /*
1873  * Implements the Flush command. The specification states:
1874  *    If a volatile write cache is not present, Flush commands complete
1875  *    successfully and have no effect
1876  * in the description of the Volatile Write Cache (VWC) field of the Identify
1877  * Controller data. Therefore, set status to Success if the command is
1878  * not supported (i.e. RAM or as indicated by the blockif).
1879  */
1880 static bool
1881 nvme_opc_flush(struct pci_nvme_softc *sc,
1882     struct nvme_command *cmd,
1883     struct pci_nvme_blockstore *nvstore,
1884     struct pci_nvme_ioreq *req,
1885     uint16_t *status)
1886 {
1887         bool pending = false;
1888
1889         if (nvstore->type == NVME_STOR_RAM) {
1890                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1891         } else {
1892                 int err;
1893
1894                 req->io_req.br_callback = pci_nvme_io_done;
1895
1896                 err = blockif_flush(nvstore->ctx, &req->io_req);
1897                 switch (err) {
1898                 case 0:
1899                         pending = true;
1900                         break;
1901                 case EOPNOTSUPP:
1902                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1903                         break;
1904                 default:
1905                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1906                 }
1907         }
1908
1909         return (pending);
1910 }
1911
1912 static uint16_t
1913 nvme_write_read_ram(struct pci_nvme_softc *sc,
1914     struct pci_nvme_blockstore *nvstore,
1915     uint64_t prp1, uint64_t prp2,
1916     size_t offset, uint64_t bytes,
1917     bool is_write)
1918 {
1919         uint8_t *buf = nvstore->ctx;
1920         enum nvme_copy_dir dir;
1921         uint16_t status;
1922
1923         if (is_write)
1924                 dir = NVME_COPY_TO_PRP;
1925         else
1926                 dir = NVME_COPY_FROM_PRP;
1927
1928         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1929             buf + offset, bytes, dir))
1930                 pci_nvme_status_genc(&status,
1931                     NVME_SC_DATA_TRANSFER_ERROR);
1932         else
1933                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1934
1935         return (status);
1936 }
1937
1938 static uint16_t
1939 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1940     struct pci_nvme_blockstore *nvstore,
1941     struct pci_nvme_ioreq *req,
1942     uint64_t prp1, uint64_t prp2,
1943     size_t offset, uint64_t bytes,
1944     bool is_write)
1945 {
1946         uint64_t size;
1947         int err;
1948         uint16_t status = NVME_NO_STATUS;
1949
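             /* PRP1 may carry an offset into its page, so it maps at most the rest of that page */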
1950         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1951         if (pci_nvme_append_iov_req(sc, req, prp1,
1952             size, is_write, offset)) {
1953                 pci_nvme_status_genc(&status,
1954                     NVME_SC_DATA_TRANSFER_ERROR);
1955                 goto out;
1956         }
1957
1958         offset += size;
1959         bytes  -= size;
1960
1961         if (bytes == 0) {
1962                 ;
1963         } else if (bytes <= PAGE_SIZE) {
1964                 size = bytes;
1965                 if (pci_nvme_append_iov_req(sc, req, prp2,
1966                     size, is_write, offset)) {
1967                         pci_nvme_status_genc(&status,
1968                             NVME_SC_DATA_TRANSFER_ERROR);
1969                         goto out;
1970                 }
1971         } else {
1972                 void *vmctx = sc->nsc_pi->pi_vmctx;
1973                 uint64_t *prp_list = &prp2;
1974                 uint64_t *last = prp_list;
1975
1976                 /* PRP2 is pointer to a physical region page list */
1977                 while (bytes) {
1978                         /* Last entry in list points to the next list */
1979                         if (prp_list == last) {
1980                                 uint64_t prp = *prp_list;
1981
1982                                 prp_list = paddr_guest2host(vmctx, prp,
1983                                     PAGE_SIZE - (prp % PAGE_SIZE));
1984                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
1985                         }
1986
1987                         size = MIN(bytes, PAGE_SIZE);
1988
1989                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
1990                             size, is_write, offset)) {
1991                                 pci_nvme_status_genc(&status,
1992                                     NVME_SC_DATA_TRANSFER_ERROR);
1993                                 goto out;
1994                         }
1995
1996                         offset += size;
1997                         bytes  -= size;
1998
1999                         prp_list++;
2000                 }
2001         }
2002         req->io_req.br_callback = pci_nvme_io_done;
2003         if (is_write)
2004                 err = blockif_write(nvstore->ctx, &req->io_req);
2005         else
2006                 err = blockif_read(nvstore->ctx, &req->io_req);
2007
2008         if (err)
2009                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2010 out:
2011         return (status);
2012 }
2013
2014 static bool
2015 nvme_opc_write_read(struct pci_nvme_softc *sc,
2016     struct nvme_command *cmd,
2017     struct pci_nvme_blockstore *nvstore,
2018     struct pci_nvme_ioreq *req,
2019     uint16_t *status)
2020 {
2021         uint64_t lba, nblocks, bytes;
2022         size_t offset;
2023         bool is_write = cmd->opc == NVME_OPC_WRITE;
2024         bool pending = false;
2025
2026         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
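             /* Number of Logical Blocks (NLB) is a zero-based value */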
2027         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2028         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2029                 WPRINTF("%s command would exceed LBA range", __func__);
2030                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2031                 goto out;
2032         }
2033
2034         bytes  = nblocks << nvstore->sectsz_bits;
2035         if (bytes > NVME_MAX_DATA_SIZE) {
2036                 WPRINTF("%s command would exceed MDTS", __func__);
2037                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2038                 goto out;
2039         }
2040
2041         offset = lba << nvstore->sectsz_bits;
2042
2043         req->bytes = bytes;
2044         req->io_req.br_offset = lba;
2045
2046         /* PRP bits 1:0 must be zero */
2047         cmd->prp1 &= ~0x3UL;
2048         cmd->prp2 &= ~0x3UL;
2049
2050         if (nvstore->type == NVME_STOR_RAM) {
2051                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2052                     cmd->prp2, offset, bytes, is_write);
2053         } else {
2054                 *status = nvme_write_read_blockif(sc, nvstore, req,
2055                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2056
2057                 if (*status == NVME_NO_STATUS)
2058                         pending = true;
2059         }
2060 out:
2061         if (!pending)
2062                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2063
2064         return (pending);
2065 }
2066
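     /*
      * Completion callback for multi-range Deallocate requests: each
      * invocation either issues the delete for the next range or posts
      * the final completion.
      */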
2067 static void
2068 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2069 {
2070         struct pci_nvme_ioreq *req = br->br_param;
2071         struct pci_nvme_softc *sc = req->sc;
2072         bool done = true;
2073         uint16_t status;
2074
2075         if (err) {
2076                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2077         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2078                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2079         } else {
2080                 struct iovec *iov = req->io_req.br_iov;
2081
2082                 req->prev_gpaddr++;
2083                 iov += req->prev_gpaddr;
2084
2085                 /* The iov_* values already include the sector size */
2086                 req->io_req.br_offset = (off_t)iov->iov_base;
2087                 req->io_req.br_resid = iov->iov_len;
2088                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2089                         pci_nvme_status_genc(&status,
2090                             NVME_SC_INTERNAL_DEVICE_ERROR);
2091                 } else
2092                         done = false;
2093         }
2094
2095         if (done) {
2096                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2097                     req->cid, 0, status);
2098                 pci_nvme_release_ioreq(sc, req);
2099         }
2100 }
2101
2102 static bool
2103 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2104     struct nvme_command *cmd,
2105     struct pci_nvme_blockstore *nvstore,
2106     struct pci_nvme_ioreq *req,
2107     uint16_t *status)
2108 {
2109         struct nvme_dsm_range *range;
2110         uint32_t nr, r, non_zero, dr;
2111         int err;
2112         bool pending = false;
2113
2114         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2115                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2116                 goto out;
2117         }
2118
2119         nr = cmd->cdw10 & 0xff;
2120
2121         /* copy locally because a range entry could straddle PRPs */
2122         range = calloc(1, NVME_MAX_DSM_TRIM);
2123         if (range == NULL) {
2124                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2125                 goto out;
2126         }
2127         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2128             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2129
2130         /* Check for invalid ranges and the number of non-zero lengths */
2131         non_zero = 0;
2132         for (r = 0; r <= nr; r++) {
2133                 if (pci_nvme_out_of_range(nvstore,
2134                     range[r].starting_lba, range[r].length)) {
2135                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2136                         goto out;
2137                 }
2138                 if (range[r].length != 0)
2139                         non_zero++;
2140         }
2141
2142         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2143                 size_t offset, bytes;
2144                 int sectsz_bits = sc->nvstore.sectsz_bits;
2145
2146                 /*
2147                  * DSM calls are advisory only, and compliant controllers
2148                  * may choose to take no actions (i.e. return Success).
2149                  */
2150                 if (!nvstore->deallocate) {
2151                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2152                         goto out;
2153                 }
2154
2155                 /* If all ranges have a zero length, return Success */
2156                 if (non_zero == 0) {
2157                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2158                         goto out;
2159                 }
2160
2161                 if (req == NULL) {
2162                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2163                         goto out;
2164                 }
2165
2166                 offset = range[0].starting_lba << sectsz_bits;
2167                 bytes = range[0].length << sectsz_bits;
2168
2169                 /*
2170                  * If the request is for more than a single range, store
2171                  * the ranges in the br_iov. Optimize for the common case
2172                  * of a single range.
2173                  *
2174                  * Note that NVMe Number of Ranges is a zero based value
2175                  */
2176                 req->io_req.br_iovcnt = 0;
2177                 req->io_req.br_offset = offset;
2178                 req->io_req.br_resid = bytes;
2179
2180                 if (nr == 0) {
2181                         req->io_req.br_callback = pci_nvme_io_done;
2182                 } else {
2183                         struct iovec *iov = req->io_req.br_iov;
2184
2185                         for (r = 0, dr = 0; r <= nr; r++) {
2186                                 offset = range[r].starting_lba << sectsz_bits;
2187                                 bytes = range[r].length << sectsz_bits;
2188                                 if (bytes == 0)
2189                                         continue;
2190
2191                                 if ((nvstore->size - offset) < bytes) {
2192                                         pci_nvme_status_genc(status,
2193                                             NVME_SC_LBA_OUT_OF_RANGE);
2194                                         goto out;
2195                                 }
2196                                 iov[dr].iov_base = (void *)offset;
2197                                 iov[dr].iov_len = bytes;
2198                                 dr++;
2199                         }
2200                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2201
2202                         /*
2203                          * Use prev_gpaddr to track the current entry and
2204                          * prev_size to track the number of entries
2205                          */
2206                         req->prev_gpaddr = 0;
2207                         req->prev_size = dr;
2208                 }
2209
2210                 err = blockif_delete(nvstore->ctx, &req->io_req);
2211                 if (err)
2212                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2213                 else
2214                         pending = true;
2215         }
2216 out:
2217         free(range);
2218         return (pending);
2219 }
2220
2221 static void
2222 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2223 {
2224         struct nvme_submission_queue *sq;
2225         uint16_t status;
2226         uint16_t sqhead;
2227
2228         /* handle all submissions up to sq->tail index */
2229         sq = &sc->submit_queues[idx];
2230
2231         pthread_mutex_lock(&sq->mtx);
2232
2233         sqhead = sq->head;
2234         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2235                  idx, sqhead, sq->tail, sq->qbase);
2236
2237         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2238                 struct nvme_command *cmd;
2239                 struct pci_nvme_ioreq *req;
2240                 uint32_t nsid;
2241                 bool pending;
2242
2243                 pending = false;
2244                 req = NULL;
2245                 status = 0;
2246
2247                 cmd = &sq->qbase[sqhead];
2248                 sqhead = (sqhead + 1) % sq->size;
2249
2250                 nsid = le32toh(cmd->nsid);
2251                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2252                         pci_nvme_status_genc(&status,
2253                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2254                         status |=
2255                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2256                         goto complete;
2257                 }
2258
2259                 req = pci_nvme_get_ioreq(sc);
2260                 if (req == NULL) {
2261                         pci_nvme_status_genc(&status,
2262                             NVME_SC_INTERNAL_DEVICE_ERROR);
2263                         WPRINTF("%s: unable to allocate IO req", __func__);
2264                         goto complete;
2265                 }
2266                 req->nvme_sq = sq;
2267                 req->sqid = idx;
2268                 req->opc = cmd->opc;
2269                 req->cid = cmd->cid;
2270                 req->nsid = cmd->nsid;
2271
2272                 switch (cmd->opc) {
2273                 case NVME_OPC_FLUSH:
2274                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2275                             req, &status);
2276                         break;
2277                 case NVME_OPC_WRITE:
2278                 case NVME_OPC_READ:
2279                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2280                             req, &status);
2281                         break;
2282                 case NVME_OPC_WRITE_ZEROES:
2283                         /* TODO: write zeroes
2284                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2285                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2286                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2287                         break;
2288                 case NVME_OPC_DATASET_MANAGEMENT:
2289                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2290                             req, &status);
2291                         break;
2292                 default:
2293                         WPRINTF("%s unhandled io command 0x%x",
2294                             __func__, cmd->opc);
2295                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2296                 }
2297 complete:
2298                 if (!pending) {
2299                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2300                             status);
2301                         if (req != NULL)
2302                                 pci_nvme_release_ioreq(sc, req);
2303                 }
2304         }
2305
2306         sq->head = sqhead;
2307
2308         pthread_mutex_unlock(&sq->mtx);
2309 }
2310
2311 static void
2312 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2313         uint64_t idx, int is_sq, uint64_t value)
2314 {
2315         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2316                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2317
2318         if (is_sq) {
2319                 if (idx > sc->num_squeues) {
2320                         WPRINTF("%s queue index %lu overflow from "
2321                                  "guest (max %u)",
2322                                  __func__, idx, sc->num_squeues);
2323                         return;
2324                 }
2325
2326                 atomic_store_short(&sc->submit_queues[idx].tail,
2327                                    (uint16_t)value);
2328
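                     /* Submission queue 0 is the Admin Submission Queue */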
2329                 if (idx == 0) {
2330                         pci_nvme_handle_admin_cmd(sc, value);
2331                 } else {
2332                         /* submission queue; handle new entries in SQ */
2333                         if (idx > sc->num_squeues) {
2334                                 WPRINTF("%s SQ index %lu overflow from "
2335                                          "guest (max %u)",
2336                                          __func__, idx, sc->num_squeues);
2337                                 return;
2338                         }
2339                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2340                 }
2341         } else {
2342                 if (idx > sc->num_cqueues) {
2343                         WPRINTF("%s queue index %lu overflow from "
2344                                  "guest (max %u)",
2345                                  __func__, idx, sc->num_cqueues);
2346                         return;
2347                 }
2348
2349                 atomic_store_short(&sc->compl_queues[idx].head,
2350                                 (uint16_t)value);
2351         }
2352 }
2353
2354 static void
2355 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2356 {
2357         const char *s = iswrite ? "WRITE" : "READ";
2358
2359         switch (offset) {
2360         case NVME_CR_CAP_LOW:
2361                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2362                 break;
2363         case NVME_CR_CAP_HI:
2364                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2365                 break;
2366         case NVME_CR_VS:
2367                 DPRINTF("%s %s NVME_CR_VS", func, s);
2368                 break;
2369         case NVME_CR_INTMS:
2370                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2371                 break;
2372         case NVME_CR_INTMC:
2373                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2374                 break;
2375         case NVME_CR_CC:
2376                 DPRINTF("%s %s NVME_CR_CC", func, s);
2377                 break;
2378         case NVME_CR_CSTS:
2379                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2380                 break;
2381         case NVME_CR_NSSR:
2382                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2383                 break;
2384         case NVME_CR_AQA:
2385                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2386                 break;
2387         case NVME_CR_ASQ_LOW:
2388                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2389                 break;
2390         case NVME_CR_ASQ_HI:
2391                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2392                 break;
2393         case NVME_CR_ACQ_LOW:
2394                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2395                 break;
2396         case NVME_CR_ACQ_HI:
2397                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2398                 break;
2399         default:
2400                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2401         }
2402
2403 }
2404
2405 static void
2406 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2407         uint64_t offset, int size, uint64_t value)
2408 {
2409         uint32_t ccreg;
2410
2411         if (offset >= NVME_DOORBELL_OFFSET) {
2412                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
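                     /* Each queue pair has an 8-byte doorbell region: SQ tail at offset 0, CQ head at offset 4 */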
2413                 uint64_t idx = belloffset / 8; /* doorbell size = 2*int */
2414                 int is_sq = (belloffset % 8) < 4;
2415
2416                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2417                         WPRINTF("guest attempted an overflow write offset "
2418                                  "0x%lx, val 0x%lx in %s",
2419                                  offset, value, __func__);
2420                         return;
2421                 }
2422
2423                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2424                 return;
2425         }
2426
2427         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2428                 offset, size, value);
2429
2430         if (size != 4) {
2431                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2432                          "val 0x%lx) to bar0 in %s",
2433                          size, offset, value, __func__);
2434                 /* TODO: shutdown device */
2435                 return;
2436         }
2437
2438         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2439
2440         pthread_mutex_lock(&sc->mtx);
2441
2442         switch (offset) {
2443         case NVME_CR_CAP_LOW:
2444         case NVME_CR_CAP_HI:
2445                 /* readonly */
2446                 break;
2447         case NVME_CR_VS:
2448                 /* readonly */
2449                 break;
2450         case NVME_CR_INTMS:
2451                 /* MSI-X, so ignore */
2452                 break;
2453         case NVME_CR_INTMC:
2454                 /* MSI-X, so ignore */
2455                 break;
2456         case NVME_CR_CC:
2457                 ccreg = (uint32_t)value;
2458
2459                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2460                          "iocqes %u",
2461                         __func__,
2462                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2463                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2464                          NVME_CC_GET_IOCQES(ccreg));
2465
2466                 if (NVME_CC_GET_SHN(ccreg)) {
2467                         /* perform shutdown - flush out data to backend */
2468                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2469                             NVME_CSTS_REG_SHST_SHIFT);
2470                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2471                             NVME_CSTS_REG_SHST_SHIFT;
2472                 }
2473                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2474                         if (NVME_CC_GET_EN(ccreg) == 0)
2475                                 /* transition 1->0 causes controller reset */
2476                                 pci_nvme_reset_locked(sc);
2477                         else
2478                                 pci_nvme_init_controller(ctx, sc);
2479                 }
2480
2481                 /* Insert the iocqes, iosqes and en bits from the write */
2482                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2483                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2484                 if (NVME_CC_GET_EN(ccreg) == 0) {
2485                         /* Insert the ams, mps and css bit fields */
2486                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2487                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2488                         sc->regs.csts &= ~NVME_CSTS_RDY;
2489                 } else if (sc->pending_ios == 0) {
2490                         sc->regs.csts |= NVME_CSTS_RDY;
2491                 }
2492                 break;
2493         case NVME_CR_CSTS:
2494                 break;
2495         case NVME_CR_NSSR:
2496                 /* ignore writes; don't support subsystem reset */
2497                 break;
2498         case NVME_CR_AQA:
2499                 sc->regs.aqa = (uint32_t)value;
2500                 break;
2501         case NVME_CR_ASQ_LOW:
2502                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2503                                (0xFFFFF000 & value);
2504                 break;
2505         case NVME_CR_ASQ_HI:
2506                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2507                                (value << 32);
2508                 break;
2509         case NVME_CR_ACQ_LOW:
2510                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2511                                (0xFFFFF000 & value);
2512                 break;
2513         case NVME_CR_ACQ_HI:
2514                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2515                                (value << 32);
2516                 break;
2517         default:
2518                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2519                          __func__, offset, value, size);
2520         }
2521         pthread_mutex_unlock(&sc->mtx);
2522 }
2523
2524 static void
2525 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2526                 int baridx, uint64_t offset, int size, uint64_t value)
2527 {
2528         struct pci_nvme_softc* sc = pi->pi_arg;
2529
2530         if (baridx == pci_msix_table_bar(pi) ||
2531             baridx == pci_msix_pba_bar(pi)) {
2532                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2533                          " value 0x%lx", baridx, offset, size, value);
2534
2535                 pci_emul_msix_twrite(pi, offset, size, value);
2536                 return;
2537         }
2538
2539         switch (baridx) {
2540         case 0:
2541                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2542                 break;
2543
2544         default:
2545                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2546                          __func__, baridx, value);
2547         }
2548 }
2549
2550 static uint64_t
2551 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
2552 {
2553         uint64_t value;
2554
2555         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2556
2557         if (offset < NVME_DOORBELL_OFFSET) {
2558                 void *p = &(sc->regs);
2559                 pthread_mutex_lock(&sc->mtx);
2560                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2561                 pthread_mutex_unlock(&sc->mtx);
2562         } else {
2563                 value = 0;
2564                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2565         }
2566
2567         switch (size) {
2568         case 1:
2569                 value &= 0xFF;
2570                 break;
2571         case 2:
2572                 value &= 0xFFFF;
2573                 break;
2574         case 4:
2575                 value &= 0xFFFFFFFF;
2576                 break;
2577         }
2578
2579         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2580                  offset, size, (uint32_t)value);
2581
2582         return (value);
2583 }
2584
2587 static uint64_t
2588 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2589     uint64_t offset, int size)
2590 {
2591         struct pci_nvme_softc* sc = pi->pi_arg;
2592
2593         if (baridx == pci_msix_table_bar(pi) ||
2594             baridx == pci_msix_pba_bar(pi)) {
2595                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2596                         baridx, offset, size);
2597
2598                 return pci_emul_msix_tread(pi, offset, size);
2599         }
2600
2601         switch (baridx) {
2602         case 0:
2603                 return pci_nvme_read_bar_0(sc, offset, size);
2604
2605         default:
2606                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2607         }
2608
2609         return (0);
2610 }
2611
2612
2613 static int
2614 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2615 {
2616         char bident[sizeof("XX:X:X")];
2617         char    *uopt, *xopts, *config;
2618         uint32_t sectsz;
2619         int optidx;
2620
2621         sc->max_queues = NVME_QUEUES;
2622         sc->max_qentries = NVME_MAX_QENTRIES;
2623         sc->ioslots = NVME_IOSLOTS;
2624         sc->num_squeues = sc->max_queues;
2625         sc->num_cqueues = sc->max_queues;
2626         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2627         sectsz = 0;
2628
2629         uopt = strdup(opts);
2630         optidx = 0;
2631         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2632                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2633         for (xopts = strtok(uopt, ",");
2634              xopts != NULL;
2635              xopts = strtok(NULL, ",")) {
2636
2637                 if ((config = strchr(xopts, '=')) != NULL)
2638                         *config++ = '\0';
2639
2640                 if (!strcmp("maxq", xopts)) {
2641                         sc->max_queues = atoi(config);
2642                 } else if (!strcmp("qsz", xopts)) {
2643                         sc->max_qentries = atoi(config);
2644                 } else if (!strcmp("ioslots", xopts)) {
2645                         sc->ioslots = atoi(config);
2646                 } else if (!strcmp("sectsz", xopts)) {
2647                         sectsz = atoi(config);
2648                 } else if (!strcmp("ser", xopts)) {
2649                         /*
2650                          * This field indicates the Product Serial Number in
2651                          * 7-bit ASCII, unused bytes should be space characters.
2652                          * Ref: NVMe v1.3c.
2653                          */
2654                         cpywithpad((char *)sc->ctrldata.sn,
2655                                    sizeof(sc->ctrldata.sn), config, ' ');
2656                 } else if (!strcmp("ram", xopts)) {
2657                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
2658
2659                         sc->nvstore.type = NVME_STOR_RAM;
2660                         sc->nvstore.size = sz * 1024 * 1024;
2661                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2662                         sc->nvstore.sectsz = 4096;
2663                         sc->nvstore.sectsz_bits = 12;
2664                         if (sc->nvstore.ctx == NULL) {
2665                                 perror("Unable to allocate RAM");
2666                                 free(uopt);
2667                                 return (-1);
2668                         }
2669                 } else if (!strcmp("eui64", xopts)) {
2670                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2671                 } else if (!strcmp("dsm", xopts)) {
2672                         if (!strcmp("auto", config))
2673                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2674                         else if (!strcmp("enable", config))
2675                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2676                         else if (!strcmp("disable", config))
2677                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2678                 } else if (optidx == 0) {
2679                         snprintf(bident, sizeof(bident), "%d:%d",
2680                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2681                         sc->nvstore.ctx = blockif_open(xopts, bident);
2682                         if (sc->nvstore.ctx == NULL) {
2683                                 perror("Could not open backing file");
2684                                 free(uopt);
2685                                 return (-1);
2686                         }
2687                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2688                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2689                 } else {
2690                         EPRINTLN("Invalid option %s", xopts);
2691                         free(uopt);
2692                         return (-1);
2693                 }
2694
2695                 optidx++;
2696         }
2697         free(uopt);
2698
2699         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2700                 EPRINTLN("backing store not specified");
2701                 return (-1);
2702         }
2703         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2704                 sc->nvstore.sectsz = sectsz;
2705         else if (sc->nvstore.type != NVME_STOR_RAM)
2706                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
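             /* Compute log2 of the sector size for later LBA-to-byte conversions */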
2707         for (sc->nvstore.sectsz_bits = 9;
2708              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2709              sc->nvstore.sectsz_bits++);
2710
2711         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2712                 sc->max_queues = NVME_QUEUES;
2713
2714         if (sc->max_qentries <= 0) {
2715                 EPRINTLN("Invalid qsz option");
2716                 return (-1);
2717         }
2718         if (sc->ioslots <= 0) {
2719                 EPRINTLN("Invalid ioslots option");
2720                 return (-1);
2721         }
2722
2723         return (0);
2724 }
2725
2726 static int
2727 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2728 {
2729         struct pci_nvme_softc *sc;
2730         uint32_t pci_membar_sz;
2731         int     error;
2732
2733         error = 0;
2734
2735         sc = calloc(1, sizeof(struct pci_nvme_softc));
2736         pi->pi_arg = sc;
2737         sc->nsc_pi = pi;
2738
2739         error = pci_nvme_parse_opts(sc, opts);
2740         if (error < 0)
2741                 goto done;
2742         else
2743                 error = 0;
2744
2745         STAILQ_INIT(&sc->ioreqs_free);
2746         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2747         for (int i = 0; i < sc->ioslots; i++) {
2748                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2749         }
2750
2751         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2752         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2753         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2754         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2755         pci_set_cfgdata8(pi, PCIR_PROGIF,
2756                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2757
2758         /*
2759          * Allocate size of NVMe registers + doorbell space for all queues.
2760          *
2761          * The specification requires a minimum memory I/O window size of 16K.
2762          * The Windows driver will refuse to start a device with a smaller
2763          * window.
2764          */
2765         pci_membar_sz = sizeof(struct nvme_registers) +
2766             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2767         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2768
2769         DPRINTF("nvme membar size: %u", pci_membar_sz);
2770
2771         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2772         if (error) {
2773                 WPRINTF("%s pci alloc mem bar failed", __func__);
2774                 goto done;
2775         }
2776
2777         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2778         if (error) {
2779                 WPRINTF("%s pci add msixcap failed", __func__);
2780                 goto done;
2781         }
2782
2783         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2784         if (error) {
2785                 WPRINTF("%s pci add Express capability failed", __func__);
2786                 goto done;
2787         }
2788
2789         pthread_mutex_init(&sc->mtx, NULL);
2790         sem_init(&sc->iosemlock, 0, sc->ioslots);
2791
2792         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2793         /*
2794          * Controller data depends on Namespace data so initialize Namespace
2795          * data first.
2796          */
2797         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2798         pci_nvme_init_ctrldata(sc);
2799         pci_nvme_init_logpages(sc);
2800         pci_nvme_init_features(sc);
2801
2802         pci_nvme_aer_init(sc);
2803
2804         pci_nvme_reset(sc);
2805
2806         pci_lintr_request(pi);
2807
2808 done:
2809         return (error);
2810 }
2811
2812
2813 struct pci_devemu pci_de_nvme = {
2814         .pe_emu =       "nvme",
2815         .pe_init =      pci_nvme_init,
2816         .pe_barwrite =  pci_nvme_write,
2817         .pe_barread =   pci_nvme_read
2818 };
2819 PCI_EMUL_SET(pci_de_nvme);