1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
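 *  Example (slot number, device path, and option values are illustrative only):
 *    -s 4,nvme,/dev/zvol/tank/nvm0,maxq=8,qsz=512,ioslots=16,sectsz=512,ser=NVME0001,dsm=auto
 *    -s 4,nvme,ram=1024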
52  */
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102
103 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
104 #define NVME_MAX_BLOCKIOVS      512
105
106 /* This is a synthetic status code to indicate there is no status */
107 #define NVME_NO_STATUS          0xffff
108 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
109
110 /* helpers */
111
112 /* Convert a zero-based value into a one-based value */
113 #define ONE_BASED(zero)         ((zero) + 1)
114 /* Convert a one-based value into a zero-based value */
115 #define ZERO_BASED(one)         ((one)  - 1)
116
117 /* Encode number of SQ's and CQ's for Set/Get Features */
118 #define NVME_FEATURE_NUM_QUEUES(sc) \
119         (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
120         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
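/*
 * e.g. with 4 SQs and 2 CQs allocated, this encodes as 0x00010003
 * (NSQA = 3 in the low word, NCQA = 1 in the high word, both zero-based)
 */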
121
122 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
123
124 enum nvme_controller_register_offsets {
125         NVME_CR_CAP_LOW = 0x00,
126         NVME_CR_CAP_HI  = 0x04,
127         NVME_CR_VS      = 0x08,
128         NVME_CR_INTMS   = 0x0c,
129         NVME_CR_INTMC   = 0x10,
130         NVME_CR_CC      = 0x14,
131         NVME_CR_CSTS    = 0x1c,
132         NVME_CR_NSSR    = 0x20,
133         NVME_CR_AQA     = 0x24,
134         NVME_CR_ASQ_LOW = 0x28,
135         NVME_CR_ASQ_HI  = 0x2c,
136         NVME_CR_ACQ_LOW = 0x30,
137         NVME_CR_ACQ_HI  = 0x34,
138 };
139
140 enum nvme_cmd_cdw11 {
141         NVME_CMD_CDW11_PC  = 0x0001,
142         NVME_CMD_CDW11_IEN = 0x0002,
143         NVME_CMD_CDW11_IV  = 0xFFFF0000,
144 };
145
146 enum nvme_copy_dir {
147         NVME_COPY_TO_PRP,
148         NVME_COPY_FROM_PRP,
149 };
150
151 #define NVME_CQ_INTEN   0x01
152 #define NVME_CQ_INTCOAL 0x02
153
154 struct nvme_completion_queue {
155         struct nvme_completion *qbase;
156         pthread_mutex_t mtx;
157         uint32_t        size;
158         uint16_t        tail; /* nvme progress */
159         uint16_t        head; /* guest progress */
160         uint16_t        intr_vec;
161         uint32_t        intr_en;
162 };
163
164 struct nvme_submission_queue {
165         struct nvme_command *qbase;
166         pthread_mutex_t mtx;
167         uint32_t        size;
168         uint16_t        head; /* nvme progress */
169         uint16_t        tail; /* guest progress */
170         uint16_t        cqid; /* completion queue id */
171         int             qpriority;
172 };
173
174 enum nvme_storage_type {
175         NVME_STOR_BLOCKIF = 0,
176         NVME_STOR_RAM = 1,
177 };
178
179 struct pci_nvme_blockstore {
180         enum nvme_storage_type type;
181         void            *ctx;
182         uint64_t        size;
183         uint32_t        sectsz;
184         uint32_t        sectsz_bits;
185         uint64_t        eui64;
186         uint32_t        deallocate:1;
187 };
188
189 struct pci_nvme_ioreq {
190         struct pci_nvme_softc *sc;
191         STAILQ_ENTRY(pci_nvme_ioreq) link;
192         struct nvme_submission_queue *nvme_sq;
193         uint16_t        sqid;
194
195         /* command information */
196         uint16_t        opc;
197         uint16_t        cid;
198         uint32_t        nsid;
199
200         uint64_t        prev_gpaddr;
201         size_t          prev_size;
202
203         /*
204          * lock if all iovs consumed (big IO);
205          * complete transaction before continuing
206          */
207         pthread_mutex_t mtx;
208         pthread_cond_t  cv;
209
210         struct blockif_req io_req;
211
212         /* pad to fit up to 512 page descriptors from guest IO request */
213         struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
214 };
215
216 enum nvme_dsm_type {
217         /* Dataset Management bit in ONCS reflects backing storage capability */
218         NVME_DATASET_MANAGEMENT_AUTO,
219         /* Unconditionally set Dataset Management bit in ONCS */
220         NVME_DATASET_MANAGEMENT_ENABLE,
221         /* Unconditionally clear Dataset Management bit in ONCS */
222         NVME_DATASET_MANAGEMENT_DISABLE,
223 };
224
225 struct pci_nvme_softc {
226         struct pci_devinst *nsc_pi;
227
228         pthread_mutex_t mtx;
229
230         struct nvme_registers regs;
231
232         struct nvme_namespace_data  nsdata;
233         struct nvme_controller_data ctrldata;
234         struct nvme_error_information_entry err_log;
235         struct nvme_health_information_page health_log;
236         struct nvme_firmware_page fw_log;
237
238         struct pci_nvme_blockstore nvstore;
239
240         uint16_t        max_qentries;   /* max entries per queue */
241         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
242         uint32_t        num_cqueues;
243         uint32_t        num_squeues;
244
245         struct pci_nvme_ioreq *ioreqs;
246         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
247         uint32_t        pending_ios;
248         uint32_t        ioslots;
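        /* Throttles in-flight I/O: pci_nvme_get_ioreq() waits on this and
         * pci_nvme_release_ioreq() posts it */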
249         sem_t           iosemlock;
250
251         /*
252          * Memory mapped Submission and Completion queues
253          * Each array includes both Admin and IO queues
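         * Index 0 is the Admin queue; 1..num_{s,c}queues are the IO queues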
254          */
255         struct nvme_completion_queue *compl_queues;
256         struct nvme_submission_queue *submit_queues;
257
258         /* controller features */
259         uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
260         uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
261         uint32_t        async_ev_config;         /* 0x0B: async event config */
262
263         enum nvme_dsm_type dataset_management;
264 };
265
266
267 static void pci_nvme_io_partial(struct blockif_req *br, int err);
268
269 /* Controller Configuration utils */
270 #define NVME_CC_GET_EN(cc) \
271         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
272 #define NVME_CC_GET_CSS(cc) \
273         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
274 #define NVME_CC_GET_SHN(cc) \
275         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
276 #define NVME_CC_GET_IOSQES(cc) \
277         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
278 #define NVME_CC_GET_IOCQES(cc) \
279         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
280
281 #define NVME_CC_WRITE_MASK \
282         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
283          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
284          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
285
286 #define NVME_CC_NEN_WRITE_MASK \
287         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
288          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
289          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
290
291 /* Controller Status utils */
292 #define NVME_CSTS_GET_RDY(sts) \
293         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
294
295 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
296
297 /* Completion Queue status word utils */
298 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
299 #define NVME_STATUS_MASK \
300         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
301          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
302
303 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
304         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
305
306 static __inline void
307 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
308 {
309         size_t len;
310
311         len = strnlen(src, dst_size);
312         memset(dst, pad, dst_size);
313         memcpy(dst, src, len);
314 }
315
316 static __inline void
317 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
318 {
319
320         *status &= ~NVME_STATUS_MASK;
321         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
322                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
323 }
324
325 static __inline void
326 pci_nvme_status_genc(uint16_t *status, uint16_t code)
327 {
328
329         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
330 }
331
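/*
 * Flip the Phase Tag bit in a completion status word. The controller
 * inverts the phase on each pass through the Completion Queue, which is how
 * the host distinguishes newly posted entries from stale ones.
 */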
332 static __inline void
333 pci_nvme_toggle_phase(uint16_t *status, int prev)
334 {
335
336         if (prev)
337                 *status &= ~NVME_STATUS_P;
338         else
339                 *status |= NVME_STATUS_P;
340 }
341
342 /*
343  * Initialize the requested number of IO Submission and Completion Queues.
344  * Admin queues are allocated implicitly.
345  */
346 static void
347 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
348 {
349         uint32_t i;
350
351         /*
352          * Allocate and initialize the Submission Queues
353          */
354         if (nsq > NVME_QUEUES) {
355                 WPRINTF("%s: clamping number of SQ from %u to %u",
356                                         __func__, nsq, NVME_QUEUES);
357                 nsq = NVME_QUEUES;
358         }
359
360         sc->num_squeues = nsq;
361
362         sc->submit_queues = calloc(sc->num_squeues + 1,
363                                 sizeof(struct nvme_submission_queue));
364         if (sc->submit_queues == NULL) {
365                 WPRINTF("%s: SQ allocation failed", __func__);
366                 sc->num_squeues = 0;
367         } else {
368                 struct nvme_submission_queue *sq = sc->submit_queues;
369
370                 for (i = 0; i < sc->num_squeues; i++)
371                         pthread_mutex_init(&sq[i].mtx, NULL);
372         }
373
374         /*
375          * Allocate and initialize the Completion Queues
376          */
377         if (ncq > NVME_QUEUES) {
378                 WPRINTF("%s: clamping number of CQ from %u to %u",
379                                         __func__, ncq, NVME_QUEUES);
380                 ncq = NVME_QUEUES;
381         }
382
383         sc->num_cqueues = ncq;
384
385         sc->compl_queues = calloc(sc->num_cqueues + 1,
386                                 sizeof(struct nvme_completion_queue));
387         if (sc->compl_queues == NULL) {
388                 WPRINTF("%s: CQ allocation failed", __func__);
389                 sc->num_cqueues = 0;
390         } else {
391                 struct nvme_completion_queue *cq = sc->compl_queues;
392
393                 for (i = 0; i < sc->num_cqueues; i++)
394                         pthread_mutex_init(&cq[i].mtx, NULL);
395         }
396 }
397
398 static void
399 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
400 {
401         struct nvme_controller_data *cd = &sc->ctrldata;
402
403         cd->vid = 0xFB5D;
404         cd->ssvid = 0x0000;
405
406         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
407         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
408
409         /* Num of submission commands that we can handle at a time (2^rab) */
410         cd->rab   = 4;
411
412         /* FreeBSD OUI */
413         cd->ieee[0] = 0x58;
414         cd->ieee[1] = 0x9c;
415         cd->ieee[2] = 0xfc;
416
417         cd->mic = 0;
418
419         cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
420
421         cd->ver = 0x00010300;
422
423         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
424         cd->acl = 2;
425         cd->aerl = 4;
426
427         cd->lpa = 0;    /* TODO: support some simple things like SMART */
428         cd->elpe = 0;   /* max error log page entries */
429         cd->npss = 1;   /* number of power states support */
430
431         /* Warning Composite Temperature Threshold */
432         cd->wctemp = 0x0157;
433
434         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
435             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
436         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
437             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
438         cd->nn = 1;     /* number of namespaces */
439
440         cd->oncs = 0;
441         switch (sc->dataset_management) {
442         case NVME_DATASET_MANAGEMENT_AUTO:
443                 if (sc->nvstore.deallocate)
444                         cd->oncs |= NVME_ONCS_DSM;
445                 break;
446         case NVME_DATASET_MANAGEMENT_ENABLE:
447                 cd->oncs |= NVME_ONCS_DSM;
448                 break;
449         default:
450                 break;
451         }
452
453         cd->fna = 0x03;
454
455         cd->power_state[0].mp = 10;
456 }
457
458 /*
459  * Calculate the CRC-16 of the given buffer
460  * See copyright attribution at top of file
461  */
462 static uint16_t
463 crc16(uint16_t crc, const void *buffer, unsigned int len)
464 {
465         const unsigned char *cp = buffer;
466         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
467         static uint16_t const crc16_table[256] = {
468                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
469                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
470                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
471                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
472                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
473                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
474                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
475                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
476                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
477                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
478                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
479                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
480                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
481                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
482                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
483                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
484                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
485                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
486                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
487                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
488                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
489                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
490                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
491                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
492                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
493                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
494                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
495                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
496                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
497                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
498                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
499                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
500         };
501
502         while (len--)
503                 crc = (((crc >> 8) & 0xffU) ^
504                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
505         return crc;
506 }
507
508 static void
509 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
510     struct nvme_namespace_data *nd, uint32_t nsid,
511     struct pci_nvme_blockstore *nvstore)
512 {
513
514         /* Get capacity and block size information from backing store */
515         nd->nsze = nvstore->size / nvstore->sectsz;
516         nd->ncap = nd->nsze;
517         nd->nuse = nd->nsze;
518
519         if (nvstore->type == NVME_STOR_BLOCKIF)
520                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
521
522         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
523         nd->flbas = 0;
524
525         /* Create an EUI-64 if user did not provide one */
526         if (nvstore->eui64 == 0) {
527                 char *data = NULL;
528                 uint64_t eui64 = nvstore->eui64;
529
530                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
531                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
532
533                 if (data != NULL) {
534                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
535                         free(data);
536                 }
537                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
538         }
539         be64enc(nd->eui64, nvstore->eui64);
540
541         /* LBA data-sz = 2^lbads */
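        /* e.g. sectsz_bits is 9 for 512-byte sectors, 12 for 4 KiB sectors */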
542         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
543 }
544
545 static void
546 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
547 {
548
549         memset(&sc->err_log, 0, sizeof(sc->err_log));
550         memset(&sc->health_log, 0, sizeof(sc->health_log));
551         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
552 }
553
554 static void
555 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
556 {
557         uint32_t i;
558
559         DPRINTF("%s", __func__);
560
561         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
562             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
563             (60 << NVME_CAP_LO_REG_TO_SHIFT);
564
565         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
566
567         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
568
569         sc->regs.cc = 0;
570         sc->regs.csts = 0;
571
572         assert(sc->submit_queues != NULL);
573
574         for (i = 0; i < sc->num_squeues + 1; i++) {
575                 sc->submit_queues[i].qbase = NULL;
576                 sc->submit_queues[i].size = 0;
577                 sc->submit_queues[i].cqid = 0;
578                 sc->submit_queues[i].tail = 0;
579                 sc->submit_queues[i].head = 0;
580         }
581
582         assert(sc->compl_queues != NULL);
583
584         for (i = 0; i < sc->num_cqueues + 1; i++) {
585                 sc->compl_queues[i].qbase = NULL;
586                 sc->compl_queues[i].size = 0;
587                 sc->compl_queues[i].tail = 0;
588                 sc->compl_queues[i].head = 0;
589         }
590 }
591
592 static void
593 pci_nvme_reset(struct pci_nvme_softc *sc)
594 {
595         pthread_mutex_lock(&sc->mtx);
596         pci_nvme_reset_locked(sc);
597         pthread_mutex_unlock(&sc->mtx);
598 }
599
600 static void
601 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
602 {
603         uint16_t acqs, asqs;
604
605         DPRINTF("%s", __func__);
606
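        /* ASQS and ACQS in the AQA register are zero-based, hence the +1 */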
607         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
608         sc->submit_queues[0].size = asqs;
609         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
610                     sizeof(struct nvme_command) * asqs);
611
612         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
613                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
614
615         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
616             NVME_AQA_REG_ACQS_MASK) + 1;
617         sc->compl_queues[0].size = acqs;
618         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
619                  sizeof(struct nvme_completion) * acqs);
620         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
621                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
622 }
623
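/*
 * Copy 'len' bytes (at most 8KB) between a host buffer and the guest pages
 * named by PRP1 and PRP2. PRP1 covers the remainder of its physical page;
 * any remaining bytes are copied through the page at PRP2. PRP lists are
 * not handled here.
 */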
624 static int
625 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
626         size_t len, enum nvme_copy_dir dir)
627 {
628         uint8_t *p;
629         size_t bytes;
630
631         if (len > (8 * 1024)) {
632                 return (-1);
633         }
634
635         /* Copy from the start of prp1 to the end of the physical page */
636         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
637         bytes = MIN(bytes, len);
638
639         p = vm_map_gpa(ctx, prp1, bytes);
640         if (p == NULL) {
641                 return (-1);
642         }
643
644         if (dir == NVME_COPY_TO_PRP)
645                 memcpy(p, b, bytes);
646         else
647                 memcpy(b, p, bytes);
648
649         b += bytes;
650
651         len -= bytes;
652         if (len == 0) {
653                 return (0);
654         }
655
656         len = MIN(len, PAGE_SIZE);
657
658         p = vm_map_gpa(ctx, prp2, len);
659         if (p == NULL) {
660                 return (-1);
661         }
662
663         if (dir == NVME_COPY_TO_PRP)
664                 memcpy(p, b, len);
665         else
666                 memcpy(b, p, len);
667
668         return (0);
669 }
670
671 static int
672 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
673         struct nvme_completion* compl)
674 {
675         uint16_t qid = command->cdw10 & 0xffff;
676
677         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
678         if (qid == 0 || qid > sc->num_squeues) {
679                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
680                         __func__, qid, sc->num_squeues);
681                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
682                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
683                 return (1);
684         }
685
686         sc->submit_queues[qid].qbase = NULL;
687         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
688         return (1);
689 }
690
691 static int
692 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
693         struct nvme_completion* compl)
694 {
695         if (command->cdw11 & NVME_CMD_CDW11_PC) {
696                 uint16_t qid = command->cdw10 & 0xffff;
697                 struct nvme_submission_queue *nsq;
698
699                 if ((qid == 0) || (qid > sc->num_squeues)) {
700                         WPRINTF("%s queue index %u > num_squeues %u",
701                                 __func__, qid, sc->num_squeues);
702                         pci_nvme_status_tc(&compl->status,
703                             NVME_SCT_COMMAND_SPECIFIC,
704                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
705                         return (1);
706                 }
707
708                 nsq = &sc->submit_queues[qid];
709                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
710
711                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
712                               sizeof(struct nvme_command) * (size_t)nsq->size);
713                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
714                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
715
716                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
717                         qid, nsq->size, nsq->qbase, nsq->cqid);
718
719                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
720
721                 DPRINTF("%s completed creating IOSQ qid %u",
722                          __func__, qid);
723         } else {
724                 /* 
725                  * Guest sent non-cont submission queue request.
726                  * This setting is unsupported by this emulation.
727                  */
728                 WPRINTF("%s unsupported non-contig (list-based) "
729                          "create i/o submission queue", __func__);
730
731                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
732         }
733         return (1);
734 }
735
736 static int
737 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
738         struct nvme_completion* compl)
739 {
740         uint16_t qid = command->cdw10 & 0xffff;
741
742         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
743         if (qid == 0 || qid > sc->num_cqueues) {
744                 WPRINTF("%s queue index %u / num_cqueues %u",
745                         __func__, qid, sc->num_cqueues);
746                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
747                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
748                 return (1);
749         }
750
751         sc->compl_queues[qid].qbase = NULL;
752         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
753         return (1);
754 }
755
756 static int
757 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
758         struct nvme_completion* compl)
759 {
760         if (command->cdw11 & NVME_CMD_CDW11_PC) {
761                 uint16_t qid = command->cdw10 & 0xffff;
762                 struct nvme_completion_queue *ncq;
763
764                 if ((qid == 0) || (qid > sc->num_cqueues)) {
765                         WPRINTF("%s queue index %u > num_cqueues %u",
766                                 __func__, qid, sc->num_cqueues);
767                         pci_nvme_status_tc(&compl->status,
768                             NVME_SCT_COMMAND_SPECIFIC,
769                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
770                         return (1);
771                 }
772
773                 ncq = &sc->compl_queues[qid];
774                 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
775                 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
776                 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
777
778                 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
779                              command->prp1,
780                              sizeof(struct nvme_completion) * (size_t)ncq->size);
781
782                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
783         } else {
784                 /* 
785                  * Non-contig completion queue unsupported.
786                  */
787                 WPRINTF("%s unsupported non-contig (list-based) "
788                          "create i/o completion queue",
789                          __func__);
790
791                 /* 0x12 = Invalid Use of Controller Memory Buffer */
792                 pci_nvme_status_genc(&compl->status, 0x12);
793         }
794
795         return (1);
796 }
797
798 static int
799 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
800         struct nvme_completion* compl)
801 {
802         uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
803         uint8_t logpage = command->cdw10 & 0xFF;
804
805         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
806
807         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
808
809         switch (logpage) {
810         case NVME_LOG_ERROR:
811                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
812                     command->prp2, (uint8_t *)&sc->err_log, logsize,
813                     NVME_COPY_TO_PRP);
814                 break;
815         case NVME_LOG_HEALTH_INFORMATION:
816                 /* TODO: present some smart info */
817                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
818                     command->prp2, (uint8_t *)&sc->health_log, logsize,
819                     NVME_COPY_TO_PRP);
820                 break;
821         case NVME_LOG_FIRMWARE_SLOT:
822                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
823                     command->prp2, (uint8_t *)&sc->fw_log, logsize,
824                     NVME_COPY_TO_PRP);
825                 break;
826         default:
827                 WPRINTF("%s get log page %x command not supported",
828                         __func__, logpage);
829
830                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
831                     NVME_SC_INVALID_LOG_PAGE);
832         }
833
834         return (1);
835 }
836
837 static int
838 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
839         struct nvme_completion* compl)
840 {
841         void *dest;
842
843         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
844                 command->cdw10 & 0xFF, command->nsid);
845
846         switch (command->cdw10 & 0xFF) {
847         case 0x00: /* return Identify Namespace data structure */
848                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
849                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
850                     NVME_COPY_TO_PRP);
851                 break;
852         case 0x01: /* return Identify Controller data structure */
853                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
854                     command->prp2, (uint8_t *)&sc->ctrldata,
855                     sizeof(sc->ctrldata),
856                     NVME_COPY_TO_PRP);
857                 break;
858         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
859                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
860                                   sizeof(uint32_t) * 1024);
861                 ((uint32_t *)dest)[0] = 1;
862                 ((uint32_t *)dest)[1] = 0;
863                 break;
864         case 0x11:
865                 pci_nvme_status_genc(&compl->status,
866                     NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
867                 return (1);
868         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
869         case 0x10:
870         case 0x12:
871         case 0x13:
872         case 0x14:
873         case 0x15:
874         default:
875                 DPRINTF("%s unsupported identify command requested 0x%x",
876                          __func__, command->cdw10 & 0xFF);
877                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
878                 return (1);
879         }
880
881         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
882         return (1);
883 }
884
885 static int
886 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
887         struct nvme_completion* compl)
888 {
889         uint16_t nqr;   /* Number of Queues Requested */
890
891         nqr = command->cdw11 & 0xFFFF;
892         if (nqr == 0xffff) {
893                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
894                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
895                 return (-1);
896         }
897
898         sc->num_squeues = ONE_BASED(nqr);
899         if (sc->num_squeues > sc->max_queues) {
900                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
901                                         sc->max_queues);
902                 sc->num_squeues = sc->max_queues;
903         }
904
905         nqr = (command->cdw11 >> 16) & 0xFFFF;
906         if (nqr == 0xffff) {
907                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
908                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
909                 return (-1);
910         }
911
912         sc->num_cqueues = ONE_BASED(nqr);
913         if (sc->num_cqueues > sc->max_queues) {
914                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
915                                         sc->max_queues);
916                 sc->num_cqueues = sc->max_queues;
917         }
918
919         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
920
921         return (0);
922 }
923
924 static int
925 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
926         struct nvme_completion* compl)
927 {
928         int feature = command->cdw10 & 0xFF;
929         uint32_t iv;
930
931         DPRINTF("%s feature 0x%x", __func__, feature);
932         compl->cdw0 = 0;
933
934         switch (feature) {
935         case NVME_FEAT_ARBITRATION:
936                 DPRINTF("  arbitration 0x%x", command->cdw11);
937                 break;
938         case NVME_FEAT_POWER_MANAGEMENT:
939                 DPRINTF("  power management 0x%x", command->cdw11);
940                 break;
941         case NVME_FEAT_LBA_RANGE_TYPE:
942                 DPRINTF("  lba range 0x%x", command->cdw11);
943                 break;
944         case NVME_FEAT_TEMPERATURE_THRESHOLD:
945                 DPRINTF("  temperature threshold 0x%x", command->cdw11);
946                 break;
947         case NVME_FEAT_ERROR_RECOVERY:
948                 DPRINTF("  error recovery 0x%x", command->cdw11);
949                 break;
950         case NVME_FEAT_VOLATILE_WRITE_CACHE:
951                 DPRINTF("  volatile write cache 0x%x", command->cdw11);
952                 break;
953         case NVME_FEAT_NUMBER_OF_QUEUES:
954                 nvme_set_feature_queues(sc, command, compl);
955                 break;
956         case NVME_FEAT_INTERRUPT_COALESCING:
957                 DPRINTF("  interrupt coalescing 0x%x", command->cdw11);
958
959                 /* in uS */
960                 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
961
962                 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
963                 break;
964         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
965                 iv = command->cdw11 & 0xFFFF;
966
967                 DPRINTF("  interrupt vector configuration 0x%x",
968                         command->cdw11);
969
970                 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
971                         if (sc->compl_queues[i].intr_vec == iv) {
972                                 if (command->cdw11 & (1 << 16))
973                                         sc->compl_queues[i].intr_en |=
974                                                               NVME_CQ_INTCOAL;  
975                                 else
976                                         sc->compl_queues[i].intr_en &=
977                                                              ~NVME_CQ_INTCOAL;  
978                         }
979                 }
980                 break;
981         case NVME_FEAT_WRITE_ATOMICITY:
982                 DPRINTF("  write atomicity 0x%x", command->cdw11);
983                 break;
984         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
985                 DPRINTF("  async event configuration 0x%x",
986                         command->cdw11);
987                 sc->async_ev_config = command->cdw11;
988                 break;
989         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
990                 DPRINTF("  software progress marker 0x%x",
991                         command->cdw11);
992                 break;
993         case 0x0C:
994                 DPRINTF("  autonomous power state transition 0x%x",
995                         command->cdw11);
996                 break;
997         default:
998                 WPRINTF("%s invalid feature", __func__);
999                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1000                 return (1);
1001         }
1002
1003         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1004         return (1);
1005 }
1006
1007 static int
1008 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1009         struct nvme_completion* compl)
1010 {
1011         int feature = command->cdw10 & 0xFF;
1012
1013         DPRINTF("%s feature 0x%x", __func__, feature);
1014
1015         compl->cdw0 = 0;
1016
1017         switch (feature) {
1018         case NVME_FEAT_ARBITRATION:
1019                 DPRINTF("  arbitration");
1020                 break;
1021         case NVME_FEAT_POWER_MANAGEMENT:
1022                 DPRINTF("  power management");
1023                 break;
1024         case NVME_FEAT_LBA_RANGE_TYPE:
1025                 DPRINTF("  lba range");
1026                 break;
1027         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1028                 DPRINTF("  temperature threshold");
1029                 switch ((command->cdw11 >> 20) & 0x3) {
1030                 case 0:
1031                         /* Over temp threshold */
1032                         compl->cdw0 = 0xFFFF;
1033                         break;
1034                 case 1:
1035                         /* Under temp threshold */
1036                         compl->cdw0 = 0;
1037                         break;
1038                 default:
1039                         WPRINTF("  invalid threshold type select");
1040                         pci_nvme_status_genc(&compl->status,
1041                             NVME_SC_INVALID_FIELD);
1042                         return (1);
1043                 }
1044                 break;
1045         case NVME_FEAT_ERROR_RECOVERY:
1046                 DPRINTF("  error recovery");
1047                 break;
1048         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1049                 DPRINTF("  volatile write cache");
1050                 break;
1051         case NVME_FEAT_NUMBER_OF_QUEUES:
1052                 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1053
1054                 DPRINTF("  number of queues (submit %u, completion %u)",
1055                         compl->cdw0 & 0xFFFF,
1056                         (compl->cdw0 >> 16) & 0xFFFF);
1057
1058                 break;
1059         case NVME_FEAT_INTERRUPT_COALESCING:
1060                 DPRINTF("  interrupt coalescing");
1061                 break;
1062         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1063                 DPRINTF("  interrupt vector configuration");
1064                 break;
1065         case NVME_FEAT_WRITE_ATOMICITY:
1066                 DPRINTF("  write atomicity");
1067                 break;
1068         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1069                 DPRINTF("  async event configuration");
1070                 sc->async_ev_config = command->cdw11;
1071                 break;
1072         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1073                 DPRINTF("  software progress marker");
1074                 break;
1075         case 0x0C:
1076                 DPRINTF("  autonomous power state transition");
1077                 break;
1078         default:
1079                 WPRINTF("%s invalid feature 0x%x", __func__, feature);
1080                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1081                 return (1);
1082         }
1083
1084         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1085         return (1);
1086 }
1087
1088 static int
1089 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1090         struct nvme_completion* compl)
1091 {
1092         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1093                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1094
1095         /* TODO: search for the command ID and abort it */
1096
1097         compl->cdw0 = 1;
1098         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1099         return (1);
1100 }
1101
1102 static int
1103 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1104         struct nvme_command* command, struct nvme_completion* compl)
1105 {
1106         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1107
1108         /*
1109          * TODO: raise events when they happen based on the Set Features cmd.
1110          * These events happen async, so only set completion successful if
1111          * there is an event reflective of the request to get event.
1112          */
1113         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1114             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1115         return (0);
1116 }
1117
1118 static void
1119 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1120 {
1121         struct nvme_completion compl;
1122         struct nvme_command *cmd;
1123         struct nvme_submission_queue *sq;
1124         struct nvme_completion_queue *cq;
1125         uint16_t sqhead;
1126
1127         DPRINTF("%s index %u", __func__, (uint32_t)value);
1128
1129         sq = &sc->submit_queues[0];
1130         cq = &sc->compl_queues[0];
1131
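        /*
         * Guard the Admin SQ state while commands are consumed; doorbell
         * writes from the guest can otherwise race with this processing.
         */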
1132         pthread_mutex_lock(&sq->mtx);
1133
1134         sqhead = sq->head;
1135         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1136         
1137         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1138                 cmd = &(sq->qbase)[sqhead];
1139                 compl.cdw0 = 0;
1140                 compl.status = 0;
1141
1142                 switch (cmd->opc) {
1143                 case NVME_OPC_DELETE_IO_SQ:
1144                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1145                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1146                         break;
1147                 case NVME_OPC_CREATE_IO_SQ:
1148                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1149                         nvme_opc_create_io_sq(sc, cmd, &compl);
1150                         break;
1151                 case NVME_OPC_DELETE_IO_CQ:
1152                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1153                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1154                         break;
1155                 case NVME_OPC_CREATE_IO_CQ:
1156                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1157                         nvme_opc_create_io_cq(sc, cmd, &compl);
1158                         break;
1159                 case NVME_OPC_GET_LOG_PAGE:
1160                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1161                         nvme_opc_get_log_page(sc, cmd, &compl);
1162                         break;
1163                 case NVME_OPC_IDENTIFY:
1164                         DPRINTF("%s command IDENTIFY", __func__);
1165                         nvme_opc_identify(sc, cmd, &compl);
1166                         break;
1167                 case NVME_OPC_ABORT:
1168                         DPRINTF("%s command ABORT", __func__);
1169                         nvme_opc_abort(sc, cmd, &compl);
1170                         break;
1171                 case NVME_OPC_SET_FEATURES:
1172                         DPRINTF("%s command SET_FEATURES", __func__);
1173                         nvme_opc_set_features(sc, cmd, &compl);
1174                         break;
1175                 case NVME_OPC_GET_FEATURES:
1176                         DPRINTF("%s command GET_FEATURES", __func__);
1177                         nvme_opc_get_features(sc, cmd, &compl);
1178                         break;
1179                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1180                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1181                         /* XXX don't care, unhandled for now
1182                         nvme_opc_async_event_req(sc, cmd, &compl);
1183                         */
1184                         compl.status = NVME_NO_STATUS;
1185                         break;
1186                 default:
1187                         WPRINTF("0x%x command is not implemented",
1188                             cmd->opc);
1189                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1190                 }
1191                 sqhead = (sqhead + 1) % sq->size;
1192
1193                 if (NVME_COMPLETION_VALID(compl)) {
1194                         struct nvme_completion *cp;
1195                         int phase;
1196
1197                         pthread_mutex_lock(&cq->mtx);
1198
1199                         cp = &(cq->qbase)[cq->tail];
1200                         cp->cdw0 = compl.cdw0;
1201                         cp->sqid = 0;
1202                         cp->sqhd = sqhead;
1203                         cp->cid = cmd->cid;
1204
1205                         phase = NVME_STATUS_GET_P(cp->status);
1206                         cp->status = compl.status;
1207                         pci_nvme_toggle_phase(&cp->status, phase);
1208
1209                         cq->tail = (cq->tail + 1) % cq->size;
1210
1211                         pthread_mutex_unlock(&cq->mtx);
1212                 }
1213         }
1214
1215         DPRINTF("setting sqhead %u", sqhead);
1216         sq->head = sqhead;
1217
1218         if (cq->head != cq->tail)
1219                 pci_generate_msix(sc->nsc_pi, 0);
1220
1221         pthread_mutex_unlock(&sq->mtx);
1222 }
1223
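/*
 * Append a guest physical range to the blockif request's iovec list,
 * merging it with the previous entry when the ranges are contiguous. If
 * the list fills up (NVME_MAX_BLOCKIOVS), the partially built request is
 * issued and the caller waits for pci_nvme_io_partial() to signal before
 * continuing. With a NULL 'req' (RAM-backed storage) the data is copied
 * directly.
 */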
1224 static int
1225 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1226         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1227 {
1228         int iovidx;
1229
1230         if (req != NULL) {
1231                 /* concatenate contig block-iovs to minimize number of iovs */
1232                 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1233                         iovidx = req->io_req.br_iovcnt - 1;
1234
1235                         req->io_req.br_iov[iovidx].iov_base =
1236                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1237                                              req->prev_gpaddr, size);
1238
1239                         req->prev_size += size;
1240                         req->io_req.br_resid += size;
1241
1242                         req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1243                 } else {
1244                         pthread_mutex_lock(&req->mtx);
1245
1246                         iovidx = req->io_req.br_iovcnt;
1247                         if (iovidx == NVME_MAX_BLOCKIOVS) {
1248                                 int err = 0;
1249
1250                                 DPRINTF("large I/O, doing partial req");
1251
1252                                 iovidx = 0;
1253                                 req->io_req.br_iovcnt = 0;
1254
1255                                 req->io_req.br_callback = pci_nvme_io_partial;
1256
1257                                 if (!do_write)
1258                                         err = blockif_read(sc->nvstore.ctx,
1259                                                            &req->io_req);
1260                                 else
1261                                         err = blockif_write(sc->nvstore.ctx,
1262                                                             &req->io_req);
1263
1264                                 /* wait until req completes before cont */
1265                                 if (err == 0)
1266                                         pthread_cond_wait(&req->cv, &req->mtx);
1267                         }
1268                         if (iovidx == 0) {
1269                                 req->io_req.br_offset = lba;
1270                                 req->io_req.br_resid = 0;
1271                                 req->io_req.br_param = req;
1272                         }
1273
1274                         req->io_req.br_iov[iovidx].iov_base =
1275                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1276                                              gpaddr, size);
1277
1278                         req->io_req.br_iov[iovidx].iov_len = size;
1279
1280                         req->prev_gpaddr = gpaddr;
1281                         req->prev_size = size;
1282                         req->io_req.br_resid += size;
1283
1284                         req->io_req.br_iovcnt++;
1285
1286                         pthread_mutex_unlock(&req->mtx);
1287                 }
1288         } else {
1289                 /* RAM buffer: read/write directly */
1290                 void *p = sc->nvstore.ctx;
1291                 void *gptr;
1292
1293                 if ((lba + size) > sc->nvstore.size) {
1294                         WPRINTF("%s write would overflow RAM", __func__);
1295                         return (-1);
1296                 }
1297
1298                 p = (void *)((uintptr_t)p + (uintptr_t)lba);
1299                 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1300                 if (do_write) 
1301                         memcpy(p, gptr, size);
1302                 else
1303                         memcpy(gptr, p, size);
1304         }
1305         return (0);
1306 }
1307
1308 static void
1309 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1310         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1311         uint32_t cdw0, uint16_t status)
1312 {
1313         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1314         struct nvme_completion *compl;
1315         int phase;
1316
1317         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1318                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1319                  NVME_STATUS_GET_SC(status));
1320
1321         pthread_mutex_lock(&cq->mtx);
1322
1323         assert(cq->qbase != NULL);
1324
1325         compl = &cq->qbase[cq->tail];
1326
1327         compl->cdw0 = cdw0;
1328         compl->sqid = sqid;
1329         compl->sqhd = sq->head;
1330         compl->cid = cid;
1331
1332         // toggle phase
1333         phase = NVME_STATUS_GET_P(compl->status);
1334         compl->status = status;
1335         pci_nvme_toggle_phase(&compl->status, phase);
1336
1337         cq->tail = (cq->tail + 1) % cq->size;
1338
1339         pthread_mutex_unlock(&cq->mtx);
1340
1341         if (cq->head != cq->tail) {
1342                 if (cq->intr_en & NVME_CQ_INTEN) {
1343                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1344                 } else {
1345                         DPRINTF("%s: CQ%u interrupt disabled",
1346                                                 __func__, sq->cqid);
1347                 }
1348         }
1349 }
1350
1351 static void
1352 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1353 {
1354         req->sc = NULL;
1355         req->nvme_sq = NULL;
1356         req->sqid = 0;
1357
1358         pthread_mutex_lock(&sc->mtx);
1359
1360         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1361         sc->pending_ios--;
1362
1363         /* when no more IO pending, can set to ready if device reset/enabled */
1364         if (sc->pending_ios == 0 &&
1365             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1366                 sc->regs.csts |= NVME_CSTS_RDY;
1367
1368         pthread_mutex_unlock(&sc->mtx);
1369
1370         sem_post(&sc->iosemlock);
1371 }
1372
1373 static struct pci_nvme_ioreq *
1374 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1375 {
1376         struct pci_nvme_ioreq *req = NULL;
1377
1378         sem_wait(&sc->iosemlock);
1379         pthread_mutex_lock(&sc->mtx);
1380
1381         req = STAILQ_FIRST(&sc->ioreqs_free);
1382         assert(req != NULL);
1383         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1384
1385         req->sc = sc;
1386
1387         sc->pending_ios++;
1388
1389         pthread_mutex_unlock(&sc->mtx);
1390
1391         req->io_req.br_iovcnt = 0;
1392         req->io_req.br_offset = 0;
1393         req->io_req.br_resid = 0;
1394         req->io_req.br_param = req;
1395         req->prev_gpaddr = 0;
1396         req->prev_size = 0;
1397
1398         return req;
1399 }
1400
1401 static void
1402 pci_nvme_io_done(struct blockif_req *br, int err)
1403 {
1404         struct pci_nvme_ioreq *req = br->br_param;
1405         struct nvme_submission_queue *sq = req->nvme_sq;
1406         uint16_t code, status;
1407
1408         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1409
1410         /* TODO return correct error */
1411         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1412         pci_nvme_status_genc(&status, code);
1413
1414         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1415         pci_nvme_release_ioreq(req->sc, req);
1416 }
1417
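/*
 * Completion callback for the intermediate blockif request issued from
 * pci_nvme_append_iov_req() when its iovec list fills; wakes the thread
 * waiting to build the remainder of the I/O.
 */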
1418 static void
1419 pci_nvme_io_partial(struct blockif_req *br, int err)
1420 {
1421         struct pci_nvme_ioreq *req = br->br_param;
1422
1423         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1424
1425         pthread_cond_signal(&req->cv);
1426 }
1427
1428 /*
1429  * Implements the Flush command. The specification states:
1430  *    If a volatile write cache is not present, Flush commands complete
1431  *    successfully and have no effect
1432  * in the description of the Volatile Write Cache (VWC) field of the Identify
1433  * Controller data. Therefore, set status to Success if the command is
1434  * not supported (i.e. RAM or as indicated by the blockif).
1435  */
1436 static bool
1437 nvme_opc_flush(struct pci_nvme_softc *sc,
1438     struct nvme_command *cmd,
1439     struct pci_nvme_blockstore *nvstore,
1440     struct pci_nvme_ioreq *req,
1441     uint16_t *status)
1442 {
1443         bool pending = false;
1444
1445         if (nvstore->type == NVME_STOR_RAM) {
1446                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1447         } else {
1448                 int err;
1449
1450                 req->io_req.br_callback = pci_nvme_io_done;
1451
1452                 err = blockif_flush(nvstore->ctx, &req->io_req);
1453                 switch (err) {
1454                 case 0:
1455                         pending = true;
1456                         break;
1457                 case EOPNOTSUPP:
1458                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1459                         break;
1460                 default:
1461                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1462                 }
1463         }
1464
1465         return (pending);
1466 }
1467
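/*
 * Implements the Read and Write commands. The starting LBA comes from
 * CDW11:CDW10 and the (zero-based) block count from CDW12. After a bounds
 * check against the namespace size, RAM-backed storage is copied directly
 * to/from the PRPs, while blockif-backed storage gathers the PRP entries
 * into an iovec and issues an asynchronous blockif read or write.
 */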
1468 static bool
1469 nvme_opc_write_read(struct pci_nvme_softc *sc,
1470     struct nvme_command *cmd,
1471     struct pci_nvme_blockstore *nvstore,
1472     struct pci_nvme_ioreq *req,
1473     uint16_t *status)
1474 {
1475         uint64_t lba, nblocks, bytes;
1476         size_t offset;
1477         bool is_write = cmd->opc == NVME_OPC_WRITE;
1478         bool pending = false;
1479
1480         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1481         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1482
1483         offset = lba * nvstore->sectsz;
1484         bytes  = nblocks * nvstore->sectsz;
1485
1486         if ((offset + bytes) > nvstore->size) {
1487                 WPRINTF("%s command would exceed LBA range", __func__);
1488                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
1489                 goto out;
1490         }
1491
1492         req->io_req.br_offset = lba;
1493
1494         /* PRP bits 1:0 must be zero */
1495         cmd->prp1 &= ~0x3UL;
1496         cmd->prp2 &= ~0x3UL;
1497
1498         if (nvstore->type == NVME_STOR_RAM) {
1499                 uint8_t *buf = nvstore->ctx;
1500                 enum nvme_copy_dir dir;
1501
1502                 if (is_write)
1503                         dir = NVME_COPY_TO_PRP;
1504                 else
1505                         dir = NVME_COPY_FROM_PRP;
1506
1507                 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1508                     buf + offset, bytes, dir))
1509                         pci_nvme_status_genc(status,
1510                             NVME_SC_DATA_TRANSFER_ERROR);
1511                 else
1512                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1513         } else {
1514                 uint64_t size;
1515                 int err;
1516
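                /*
                 * Data pointer layout per the PRP rules: PRP1 covers from
                 * its offset to the end of that page. If the remainder fits
                 * in one more page, PRP2 is a second data pointer; otherwise
                 * PRP2 points to a PRP list whose last entry chains to the
                 * next list page.
                 */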
1517                 size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
1518                 if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
1519                     size, is_write, offset)) {
1520                         pci_nvme_status_genc(status,
1521                             NVME_SC_DATA_TRANSFER_ERROR);
1522                         goto out;
1523                 }
1524
1525                 offset += size;
1526                 bytes  -= size;
1527
1528                 if (bytes == 0) {
1529                         ;
1530                 } else if (bytes <= PAGE_SIZE) {
1531                         size = bytes;
1532                         if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
1533                             size, is_write, offset)) {
1534                                 pci_nvme_status_genc(status,
1535                                     NVME_SC_DATA_TRANSFER_ERROR);
1536                                 goto out;
1537                         }
1538                 } else {
1539                         void *vmctx = sc->nsc_pi->pi_vmctx;
1540                         uint64_t *prp_list = &cmd->prp2;
1541                         uint64_t *last = prp_list;
1542
1543                         /* PRP2 is pointer to a physical region page list */
1544                         while (bytes) {
1545                                 /* Last entry in list points to the next list */
1546                                 if (prp_list == last) {
1547                                         uint64_t prp = *prp_list;
1548
1549                                         prp_list = paddr_guest2host(vmctx, prp,
1550                                             PAGE_SIZE - (prp % PAGE_SIZE));
1551                                         last = prp_list + (NVME_PRP2_ITEMS - 1);
1552                                 }
1553
1554                                 size = MIN(bytes, PAGE_SIZE);
1555
1556                                 if (pci_nvme_append_iov_req(sc, req, *prp_list,
1557                                     size, is_write, offset)) {
1558                                         pci_nvme_status_genc(status,
1559                                             NVME_SC_DATA_TRANSFER_ERROR);
1560                                         goto out;
1561                                 }
1562
1563                                 offset += size;
1564                                 bytes  -= size;
1565
1566                                 prp_list++;
1567                         }
1568                 }
1569                 req->io_req.br_callback = pci_nvme_io_done;
1570                 if (is_write)
1571                         err = blockif_write(nvstore->ctx, &req->io_req);
1572                 else
1573                         err = blockif_read(nvstore->ctx, &req->io_req);
1574
1575                 if (err)
1576                         pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
1577                 else
1578                         pending = true;
1579         }
1580 out:
1581         return (pending);
1582 }
1583
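/*
 * Completion callback for multi-range Deallocate requests. Each call issues
 * blockif_delete() for the next range stored in the request's iovec until
 * all ranges are done or an error occurs; prev_gpaddr tracks the current
 * range index and prev_size the total number of ranges.
 */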
1584 static void
1585 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1586 {
1587         struct pci_nvme_ioreq *req = br->br_param;
1588         struct pci_nvme_softc *sc = req->sc;
1589         bool done = true;
1590         uint16_t status;
1591
1592         if (err) {
1593                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1594         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1595                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1596         } else {
1597                 struct iovec *iov = req->io_req.br_iov;
1598
1599                 req->prev_gpaddr++;
1600                 iov += req->prev_gpaddr;
1601
1602                 /* The iov_* values were stored pre-multiplied by the sector size */
1603                 req->io_req.br_offset = (off_t)iov->iov_base;
1604                 req->io_req.br_resid = iov->iov_len;
1605                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1606                         pci_nvme_status_genc(&status,
1607                             NVME_SC_INTERNAL_DEVICE_ERROR);
1608                 } else
1609                         done = false;
1610         }
1611
1612         if (done) {
1613                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1614                     req->cid, 0, status);
1615                 pci_nvme_release_ioreq(sc, req);
1616         }
1617 }
1618
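/*
 * Implements the Dataset Management command. Only the Deallocate (TRIM)
 * attribute is acted on, and only when the backing store supports it: the
 * range list is copied out of guest memory and handed to blockif_delete(),
 * one range at a time via pci_nvme_dealloc_sm() for multi-range requests.
 */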
1619 static bool
1620 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1621     struct nvme_command *cmd,
1622     struct pci_nvme_blockstore *nvstore,
1623     struct pci_nvme_ioreq *req,
1624     uint16_t *status)
1625 {
1626         int err;
1627         bool pending = false;
1628
1629         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1630                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1631                 goto out;
1632         }
1633
1634         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1635                 struct nvme_dsm_range *range;
1636                 uint32_t nr, r;
1637                 int sectsz = sc->nvstore.sectsz;
1638
1639                 /*
1640                  * DSM calls are advisory only, and compliant controllers
1641                  * may choose to take no action (i.e. return Success).
1642                  */
1643                 if (!nvstore->deallocate) {
1644                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1645                         goto out;
1646                 }
1647
1648                 if (req == NULL) {
1649                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1650                         goto out;
1651                 }
1652
1653                 /* copy locally because a range entry could straddle PRPs */
1654                 range = calloc(1, NVME_MAX_DSM_TRIM);
1655                 if (range == NULL) {
1656                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1657                         goto out;
1658                 }
1659                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1660                     (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1661
1662                 /*
1663                  * If the request is for more than a single range, store
1664                  * the ranges in the br_iov. Optimize for the common case
1665                  * of a single range.
1666                  *
1667                  * Note that NVMe Number of Ranges is a zero based value
1668                  */
1669                 nr = cmd->cdw10 & 0xff;
1670
1671                 req->io_req.br_iovcnt = 0;
1672                 req->io_req.br_offset = range[0].starting_lba * sectsz;
1673                 req->io_req.br_resid = range[0].length * sectsz;
1674
1675                 if (nr == 0) {
1676                         req->io_req.br_callback = pci_nvme_io_done;
1677                 } else {
1678                         struct iovec *iov = req->io_req.br_iov;
1679
1680                         for (r = 0; r <= nr; r++) {
1681                                 iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1682                                 iov[r].iov_len = range[r].length * sectsz;
1683                         }
1684                         req->io_req.br_callback = pci_nvme_dealloc_sm;
1685
1686                         /*
1687                          * Use prev_gpaddr to track the current entry and
1688                          * prev_size to track the number of entries
1689                          */
1690                         req->prev_gpaddr = 0;
1691                         req->prev_size = r;
1692                 }
1693
1694                 err = blockif_delete(nvstore->ctx, &req->io_req);
1695                 if (err)
1696                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1697                 else
1698                         pending = true;
1699
1700                 free(range);
1701         }
1702 out:
1703         return (pending);
1704 }
1705
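/*
 * Process new entries on an I/O submission queue, from the saved head up to
 * the tail written through the doorbell. Commands which finish in-line are
 * completed here; commands marked pending complete later from their blockif
 * callback.
 */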
1706 static void
1707 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1708 {
1709         struct nvme_submission_queue *sq;
1710         uint16_t status;
1711         uint16_t sqhead;
1712
1713         /* handle all submissions up to sq->tail index */
1714         sq = &sc->submit_queues[idx];
1715
1716         pthread_mutex_lock(&sq->mtx);
1717
1718         sqhead = sq->head;
1719         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1720                  idx, sqhead, sq->tail, sq->qbase);
1721
1722         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1723                 struct nvme_command *cmd;
1724                 struct pci_nvme_ioreq *req;
1725                 uint32_t nsid;
1726                 bool pending;
1727
1728                 pending = false;
1729                 req = NULL;
1730                 status = 0;
1731
1732                 cmd = &sq->qbase[sqhead];
1733                 sqhead = (sqhead + 1) % sq->size;
1734
1735                 nsid = le32toh(cmd->nsid);
1736                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
1737                         pci_nvme_status_genc(&status,
1738                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1739                         status |=
1740                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
1741                         goto complete;
1742                 }
1743
1744                 req = pci_nvme_get_ioreq(sc);
1745                 if (req == NULL) {
1746                         pci_nvme_status_genc(&status,
1747                             NVME_SC_INTERNAL_DEVICE_ERROR);
1748                         WPRINTF("%s: unable to allocate IO req", __func__);
1749                         goto complete;
1750                 }
1751                 req->nvme_sq = sq;
1752                 req->sqid = idx;
1753                 req->opc = cmd->opc;
1754                 req->cid = cmd->cid;
1755                 req->nsid = cmd->nsid;
1756
1757                 switch (cmd->opc) {
1758                 case NVME_OPC_FLUSH:
1759                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
1760                             req, &status);
1761                         break;
1762                 case NVME_OPC_WRITE:
1763                 case NVME_OPC_READ:
1764                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
1765                             req, &status);
1766                         break;
1767                 case NVME_OPC_WRITE_ZEROES:
1768                         /* TODO: write zeroes
1769                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
1770                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
1771                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1772                         break;
1773                 case NVME_OPC_DATASET_MANAGEMENT:
1774                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
1775                             req, &status);
1776                         break;
1777                 default:
1778                         WPRINTF("%s unhandled io command 0x%x",
1779                             __func__, cmd->opc);
1780                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
1781                 }
1782 complete:
1783                 if (!pending) {
1784                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1785                             status);
1786                         if (req != NULL)
1787                                 pci_nvme_release_ioreq(sc, req);
1788                 }
1789         }
1790
1791         sq->head = sqhead;
1792
1793         pthread_mutex_unlock(&sq->mtx);
1794 }
1795
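/*
 * Doorbell writes: a submission queue doorbell updates the queue's tail and
 * kicks command processing (queue 0 is the admin queue); a completion queue
 * doorbell records the new head.
 */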
1796 static void
1797 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1798         uint64_t idx, int is_sq, uint64_t value)
1799 {
1800         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
1801                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
1802
1803         if (is_sq) {
1804                 if (idx > sc->num_squeues) {
1805                         WPRINTF("%s queue index %lu overflow from "
1806                                  "guest (max %u)",
1807                                  __func__, idx, sc->num_squeues);
1808                         return;
1809                 }
1810
1811                 atomic_store_short(&sc->submit_queues[idx].tail,
1812                                    (uint16_t)value);
1813
1814                 if (idx == 0) {
1815                         pci_nvme_handle_admin_cmd(sc, value);
1816                 } else {
1817                         /* submission queue; handle new entries in SQ */
1818                         if (idx > sc->num_squeues) {
1819                                 WPRINTF("%s SQ index %lu overflow from "
1820                                          "guest (max %u)",
1821                                          __func__, idx, sc->num_squeues);
1822                                 return;
1823                         }
1824                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1825                 }
1826         } else {
1827                 if (idx > sc->num_cqueues) {
1828                         WPRINTF("%s queue index %lu overflow from "
1829                                  "guest (max %u)",
1830                                  __func__, idx, sc->num_cqueues);
1831                         return;
1832                 }
1833
1834                 atomic_store_short(&sc->compl_queues[idx].head,
1835                                 (uint16_t)value);
1836         }
1837 }
1838
1839 static void
1840 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1841 {
1842         const char *s = iswrite ? "WRITE" : "READ";
1843
1844         switch (offset) {
1845         case NVME_CR_CAP_LOW:
1846                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
1847                 break;
1848         case NVME_CR_CAP_HI:
1849                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
1850                 break;
1851         case NVME_CR_VS:
1852                 DPRINTF("%s %s NVME_CR_VS", func, s);
1853                 break;
1854         case NVME_CR_INTMS:
1855                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
1856                 break;
1857         case NVME_CR_INTMC:
1858                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
1859                 break;
1860         case NVME_CR_CC:
1861                 DPRINTF("%s %s NVME_CR_CC", func, s);
1862                 break;
1863         case NVME_CR_CSTS:
1864                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
1865                 break;
1866         case NVME_CR_NSSR:
1867                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
1868                 break;
1869         case NVME_CR_AQA:
1870                 DPRINTF("%s %s NVME_CR_AQA", func, s);
1871                 break;
1872         case NVME_CR_ASQ_LOW:
1873                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
1874                 break;
1875         case NVME_CR_ASQ_HI:
1876                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
1877                 break;
1878         case NVME_CR_ACQ_LOW:
1879                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
1880                 break;
1881         case NVME_CR_ACQ_HI:
1882                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
1883                 break;
1884         default:
1885                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
1886         }
1887
1888 }
1889
1890 static void
1891 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1892         uint64_t offset, int size, uint64_t value)
1893 {
1894         uint32_t ccreg;
1895
1896         if (offset >= NVME_DOORBELL_OFFSET) {
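                /*
                 * Decode the doorbell. For example, assuming the spec-default
                 * doorbell base of 0x1000 and a stride of 0: a write to
                 * 0x1000 is the SQ0 (admin) tail, 0x1004 the CQ0 head,
                 * 0x1008 the SQ1 tail, 0x100C the CQ1 head, and so on.
                 */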
1897                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1898                 uint64_t idx = belloffset / 8; /* 8 bytes per queue: SQ tail + CQ head */
1899                 int is_sq = (belloffset % 8) < 4;
1900
1901                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1902                         WPRINTF("guest attempted an overflow write offset "
1903                                  "0x%lx, val 0x%lx in %s",
1904                                  offset, value, __func__);
1905                         return;
1906                 }
1907
1908                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1909                 return;
1910         }
1911
1912         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
1913                 offset, size, value);
1914
1915         if (size != 4) {
1916                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
1917                          "val 0x%lx) to bar0 in %s",
1918                          size, offset, value, __func__);
1919                 /* TODO: shutdown device */
1920                 return;
1921         }
1922
1923         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1924
1925         pthread_mutex_lock(&sc->mtx);
1926
1927         switch (offset) {
1928         case NVME_CR_CAP_LOW:
1929         case NVME_CR_CAP_HI:
1930                 /* readonly */
1931                 break;
1932         case NVME_CR_VS:
1933                 /* readonly */
1934                 break;
1935         case NVME_CR_INTMS:
1936                 /* MSI-X, so ignore */
1937                 break;
1938         case NVME_CR_INTMC:
1939                 /* MSI-X, so ignore */
1940                 break;
1941         case NVME_CR_CC:
1942                 ccreg = (uint32_t)value;
1943
1944                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1945                          "iocqes %u",
1946                         __func__,
1947                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1948                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1949                          NVME_CC_GET_IOCQES(ccreg));
1950
1951                 if (NVME_CC_GET_SHN(ccreg)) {
1952                         /* perform shutdown - flush out data to backend */
1953                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1954                             NVME_CSTS_REG_SHST_SHIFT);
1955                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1956                             NVME_CSTS_REG_SHST_SHIFT;
1957                 }
1958                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1959                         if (NVME_CC_GET_EN(ccreg) == 0)
1960                                 /* transition 1->0 causes controller reset */
1961                                 pci_nvme_reset_locked(sc);
1962                         else
1963                                 pci_nvme_init_controller(ctx, sc);
1964                 }
1965
1966                 /* Insert the iocqes, iosqes and en bits from the write */
1967                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1968                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1969                 if (NVME_CC_GET_EN(ccreg) == 0) {
1970                         /* Insert the ams, mps and css bit fields */
1971                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1972                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1973                         sc->regs.csts &= ~NVME_CSTS_RDY;
1974                 } else if (sc->pending_ios == 0) {
1975                         sc->regs.csts |= NVME_CSTS_RDY;
1976                 }
1977                 break;
1978         case NVME_CR_CSTS:
1979                 break;
1980         case NVME_CR_NSSR:
1981                 /* ignore writes; don't support subsystem reset */
1982                 break;
1983         case NVME_CR_AQA:
1984                 sc->regs.aqa = (uint32_t)value;
1985                 break;
1986         case NVME_CR_ASQ_LOW:
1987                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1988                                (0xFFFFF000 & value);
1989                 break;
1990         case NVME_CR_ASQ_HI:
1991                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1992                                (value << 32);
1993                 break;
1994         case NVME_CR_ACQ_LOW:
1995                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1996                                (0xFFFFF000 & value);
1997                 break;
1998         case NVME_CR_ACQ_HI:
1999                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2000                                (value << 32);
2001                 break;
2002         default:
2003                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2004                          __func__, offset, value, size);
2005         }
2006         pthread_mutex_unlock(&sc->mtx);
2007 }
2008
2009 static void
2010 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2011                 int baridx, uint64_t offset, int size, uint64_t value)
2012 {
2013         struct pci_nvme_softc* sc = pi->pi_arg;
2014
2015         if (baridx == pci_msix_table_bar(pi) ||
2016             baridx == pci_msix_pba_bar(pi)) {
2017                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2018                          " value 0x%lx", baridx, offset, size, value);
2019
2020                 pci_emul_msix_twrite(pi, offset, size, value);
2021                 return;
2022         }
2023
2024         switch (baridx) {
2025         case 0:
2026                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2027                 break;
2028
2029         default:
2030                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2031                          __func__, baridx, value);
2032         }
2033 }
2034
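/*
 * BAR0 register reads are serviced from the shadow copy in sc->regs and
 * masked to the access size. Reads of the doorbell region are not supported
 * and return zero.
 */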
2035 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2036         uint64_t offset, int size)
2037 {
2038         uint64_t value;
2039
2040         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2041
2042         if (offset < NVME_DOORBELL_OFFSET) {
2043                 void *p = &(sc->regs);
2044                 pthread_mutex_lock(&sc->mtx);
2045                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2046                 pthread_mutex_unlock(&sc->mtx);
2047         } else {
2048                 value = 0;
2049                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2050         }
2051
2052         switch (size) {
2053         case 1:
2054                 value &= 0xFF;
2055                 break;
2056         case 2:
2057                 value &= 0xFFFF;
2058                 break;
2059         case 4:
2060                 value &= 0xFFFFFFFF;
2061                 break;
2062         }
2063
2064         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2065                  offset, size, (uint32_t)value);
2066
2067         return (value);
2068 }
2069
2070
2071
2072 static uint64_t
2073 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2074     uint64_t offset, int size)
2075 {
2076         struct pci_nvme_softc* sc = pi->pi_arg;
2077
2078         if (baridx == pci_msix_table_bar(pi) ||
2079             baridx == pci_msix_pba_bar(pi)) {
2080                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2081                         baridx, offset, size);
2082
2083                 return pci_emul_msix_tread(pi, offset, size);
2084         }
2085
2086         switch (baridx) {
2087         case 0:
2088                 return pci_nvme_read_bar_0(sc, offset, size);
2089
2090         default:
2091                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2092         }
2093
2094         return (0);
2095 }
2096
2097
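/*
 * Parse the comma-separated device options. key=value settings are handled
 * individually; a leading token that is not a recognized keyword is treated
 * as the blockif device path. A sectsz option is honored only for 512, 4096
 * or 8192; otherwise the blockif sector size (or 4096 for RAM) is used.
 */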
2098 static int
2099 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2100 {
2101         char bident[sizeof("XX:X:X")];
2102         char    *uopt, *xopts, *config;
2103         uint32_t sectsz;
2104         int optidx;
2105
2106         sc->max_queues = NVME_QUEUES;
2107         sc->max_qentries = NVME_MAX_QENTRIES;
2108         sc->ioslots = NVME_IOSLOTS;
2109         sc->num_squeues = sc->max_queues;
2110         sc->num_cqueues = sc->max_queues;
2111         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2112         sectsz = 0;
2113
2114         uopt = strdup(opts);
2115         optidx = 0;
2116         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2117                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2118         for (xopts = strtok(uopt, ",");
2119              xopts != NULL;
2120              xopts = strtok(NULL, ",")) {
2121
2122                 if ((config = strchr(xopts, '=')) != NULL)
2123                         *config++ = '\0';
2124
2125                 if (!strcmp("maxq", xopts)) {
2126                         sc->max_queues = atoi(config);
2127                 } else if (!strcmp("qsz", xopts)) {
2128                         sc->max_qentries = atoi(config);
2129                 } else if (!strcmp("ioslots", xopts)) {
2130                         sc->ioslots = atoi(config);
2131                 } else if (!strcmp("sectsz", xopts)) {
2132                         sectsz = atoi(config);
2133                 } else if (!strcmp("ser", xopts)) {
2134                         /*
2135                          * This field indicates the Product Serial Number in
2136                          * 7-bit ASCII; unused bytes should be space characters.
2137                          * Ref: NVMe v1.3c.
2138                          */
2139                         cpywithpad((char *)sc->ctrldata.sn,
2140                                    sizeof(sc->ctrldata.sn), config, ' ');
2141                 } else if (!strcmp("ram", xopts)) {
2142                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
2143
2144                         sc->nvstore.type = NVME_STOR_RAM;
2145                         sc->nvstore.size = sz * 1024 * 1024;
2146                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2147                         sc->nvstore.sectsz = 4096;
2148                         sc->nvstore.sectsz_bits = 12;
2149                         if (sc->nvstore.ctx == NULL) {
2150                                 perror("Unable to allocate RAM");
2151                                 free(uopt);
2152                                 return (-1);
2153                         }
2154                 } else if (!strcmp("eui64", xopts)) {
2155                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2156                 } else if (!strcmp("dsm", xopts)) {
2157                         if (!strcmp("auto", config))
2158                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2159                         else if (!strcmp("enable", config))
2160                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2161                         else if (!strcmp("disable", config))
2162                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2163                 } else if (optidx == 0) {
2164                         snprintf(bident, sizeof(bident), "%d:%d",
2165                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2166                         sc->nvstore.ctx = blockif_open(xopts, bident);
2167                         if (sc->nvstore.ctx == NULL) {
2168                                 perror("Could not open backing file");
2169                                 free(uopt);
2170                                 return (-1);
2171                         }
2172                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2173                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2174                 } else {
2175                         EPRINTLN("Invalid option %s", xopts);
2176                         free(uopt);
2177                         return (-1);
2178                 }
2179
2180                 optidx++;
2181         }
2182         free(uopt);
2183
2184         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2185                 EPRINTLN("backing store not specified");
2186                 return (-1);
2187         }
2188         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2189                 sc->nvstore.sectsz = sectsz;
2190         else if (sc->nvstore.type != NVME_STOR_RAM)
2191                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2192         for (sc->nvstore.sectsz_bits = 9;
2193              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2194              sc->nvstore.sectsz_bits++);
2195
2196         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2197                 sc->max_queues = NVME_QUEUES;
2198
2199         if (sc->max_qentries <= 0) {
2200                 EPRINTLN("Invalid qsz option");
2201                 return (-1);
2202         }
2203         if (sc->ioslots <= 0) {
2204                 EPRINTLN("Invalid ioslots option");
2205                 return (-1);
2206         }
2207
2208         return (0);
2209 }
2210
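/*
 * Device model initialization: parse the options, pre-allocate the I/O
 * request slots, set the PCI config space identifiers, size BAR0 to hold
 * the controller registers plus one doorbell pair per queue (at least the
 * 16K minimum), add the MSI-X and PCI Express capabilities, initialize the
 * namespace, controller and log page data, and reset the controller.
 */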
2211 static int
2212 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2213 {
2214         struct pci_nvme_softc *sc;
2215         uint32_t pci_membar_sz;
2216         int     error;
2217
2218         error = 0;
2219
2220         sc = calloc(1, sizeof(struct pci_nvme_softc));
2221         pi->pi_arg = sc;
2222         sc->nsc_pi = pi;
2223
2224         error = pci_nvme_parse_opts(sc, opts);
2225         if (error < 0)
2226                 goto done;
2227         else
2228                 error = 0;
2229
2230         STAILQ_INIT(&sc->ioreqs_free);
2231         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2232         for (int i = 0; i < sc->ioslots; i++) {
2233                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2234                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
2235                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
2236         }
2237         sc->intr_coales_aggr_thresh = 1;
2238
2239         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2240         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2241         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2242         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2243         pci_set_cfgdata8(pi, PCIR_PROGIF,
2244                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2245
2246         /*
2247          * Allocate size of NVMe registers + doorbell space for all queues.
2248          *
2249          * The specification requires a minimum memory I/O window size of 16K.
2250          * The Windows driver will refuse to start a device with a smaller
2251          * window.
2252          */
2253         pci_membar_sz = sizeof(struct nvme_registers) +
2254             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2255         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2256
2257         DPRINTF("nvme membar size: %u", pci_membar_sz);
2258
2259         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2260         if (error) {
2261                 WPRINTF("%s pci alloc mem bar failed", __func__);
2262                 goto done;
2263         }
2264
2265         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2266         if (error) {
2267                 WPRINTF("%s pci add msixcap failed", __func__);
2268                 goto done;
2269         }
2270
2271         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2272         if (error) {
2273                 WPRINTF("%s pci add Express capability failed", __func__);
2274                 goto done;
2275         }
2276
2277         pthread_mutex_init(&sc->mtx, NULL);
2278         sem_init(&sc->iosemlock, 0, sc->ioslots);
2279
2280         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2281         /*
2282          * Controller data depends on Namespace data, so initialize Namespace
2283          * data first.
2284          */
2285         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2286         pci_nvme_init_ctrldata(sc);
2287         pci_nvme_init_logpages(sc);
2288
2289         pci_nvme_reset(sc);
2290
2291         pci_lintr_request(pi);
2292
2293 done:
2294         return (error);
2295 }
2296
2297
2298 struct pci_devemu pci_de_nvme = {
2299         .pe_emu =       "nvme",
2300         .pe_init =      pci_nvme_init,
2301         .pe_barwrite =  pci_nvme_write,
2302         .pe_barread =   pci_nvme_read
2303 };
2304 PCI_EMUL_SET(pci_de_nvme);