1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable, disable
51  *
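 *  Example (illustrative values only; the image path and sizes below are
 *  hypothetical):
 *
 *    -s 4,nvme,/path/to/image,maxq=4,qsz=512,ioslots=16,sectsz=512,ser=NVMEEMU,dsm=auto
 *
 *  or, for a 1024 MiB RAM-backed namespace:
 *
 *    -s 4,nvme,ram=1024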
52  */
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79
80 #include <dev/nvme/nvme.h>
81
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86
87
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR           4
94
95 #define NVME_IOSLOTS            8
96
97 /* The NVMe spec reserves bits 13:4 in BAR0, so BAR0 must be at least 16 KiB */
98 #define NVME_MMIO_SPACE_MIN     (1 << 14)
99
100 #define NVME_QUEUES             16
101 #define NVME_MAX_QENTRIES       2048
102
103 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
104 #define NVME_MAX_BLOCKIOVS      512
105
106 /* This is a synthetic status code to indicate there is no status */
107 #define NVME_NO_STATUS          0xffff
108 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
109
110 /* helpers */
111
112 /* Convert a zero-based value into a one-based value */
113 #define ONE_BASED(zero)         ((zero) + 1)
114 /* Convert a one-based value into a zero-based value */
115 #define ZERO_BASED(one)         ((one)  - 1)
116
117 /* Encode number of SQ's and CQ's for Set/Get Features */
118 #define NVME_FEATURE_NUM_QUEUES(sc) \
119         (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
120         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
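/*
 * For example (illustrative): with num_squeues = 4 and num_cqueues = 4, the
 * macro above yields (4 - 1) | ((4 - 1) << 16) = 0x00030003.
 */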
121
122 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
123
124 enum nvme_controller_register_offsets {
125         NVME_CR_CAP_LOW = 0x00,
126         NVME_CR_CAP_HI  = 0x04,
127         NVME_CR_VS      = 0x08,
128         NVME_CR_INTMS   = 0x0c,
129         NVME_CR_INTMC   = 0x10,
130         NVME_CR_CC      = 0x14,
131         NVME_CR_CSTS    = 0x1c,
132         NVME_CR_NSSR    = 0x20,
133         NVME_CR_AQA     = 0x24,
134         NVME_CR_ASQ_LOW = 0x28,
135         NVME_CR_ASQ_HI  = 0x2c,
136         NVME_CR_ACQ_LOW = 0x30,
137         NVME_CR_ACQ_HI  = 0x34,
138 };
139
140 enum nvme_cmd_cdw11 {
141         NVME_CMD_CDW11_PC  = 0x0001,
142         NVME_CMD_CDW11_IEN = 0x0002,
143         NVME_CMD_CDW11_IV  = 0xFFFF0000,
144 };
145
146 enum nvme_copy_dir {
147         NVME_COPY_TO_PRP,
148         NVME_COPY_FROM_PRP,
149 };
150
151 #define NVME_CQ_INTEN   0x01
152 #define NVME_CQ_INTCOAL 0x02
153
154 struct nvme_completion_queue {
155         struct nvme_completion *qbase;
156         pthread_mutex_t mtx;
157         uint32_t        size;
158         uint16_t        tail; /* nvme progress */
159         uint16_t        head; /* guest progress */
160         uint16_t        intr_vec;
161         uint32_t        intr_en;
162 };
163
164 struct nvme_submission_queue {
165         struct nvme_command *qbase;
166         pthread_mutex_t mtx;
167         uint32_t        size;
168         uint16_t        head; /* nvme progress */
169         uint16_t        tail; /* guest progress */
170         uint16_t        cqid; /* completion queue id */
171         int             qpriority;
172 };
173
174 enum nvme_storage_type {
175         NVME_STOR_BLOCKIF = 0,
176         NVME_STOR_RAM = 1,
177 };
178
179 struct pci_nvme_blockstore {
180         enum nvme_storage_type type;
181         void            *ctx;
182         uint64_t        size;
183         uint32_t        sectsz;
184         uint32_t        sectsz_bits;
185         uint64_t        eui64;
186         uint32_t        deallocate:1;
187 };
188
189 struct pci_nvme_ioreq {
190         struct pci_nvme_softc *sc;
191         STAILQ_ENTRY(pci_nvme_ioreq) link;
192         struct nvme_submission_queue *nvme_sq;
193         uint16_t        sqid;
194
195         /* command information */
196         uint16_t        opc;
197         uint16_t        cid;
198         uint32_t        nsid;
199
200         uint64_t        prev_gpaddr;
201         size_t          prev_size;
202
203         /*
204          * lock if all iovs consumed (big IO);
205          * complete transaction before continuing
206          */
207         pthread_mutex_t mtx;
208         pthread_cond_t  cv;
209
210         struct blockif_req io_req;
211
212         /* pad to fit up to 512 page descriptors from guest IO request */
213         struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
214 };
215
216 enum nvme_dsm_type {
217         /* Dataset Management bit in ONCS reflects backing storage capability */
218         NVME_DATASET_MANAGEMENT_AUTO,
219         /* Unconditionally set Dataset Management bit in ONCS */
220         NVME_DATASET_MANAGEMENT_ENABLE,
221         /* Unconditionally clear Dataset Management bit in ONCS */
222         NVME_DATASET_MANAGEMENT_DISABLE,
223 };
224
225 struct pci_nvme_softc {
226         struct pci_devinst *nsc_pi;
227
228         pthread_mutex_t mtx;
229
230         struct nvme_registers regs;
231
232         struct nvme_namespace_data  nsdata;
233         struct nvme_controller_data ctrldata;
234         struct nvme_error_information_entry err_log;
235         struct nvme_health_information_page health_log;
236         struct nvme_firmware_page fw_log;
237
238         struct pci_nvme_blockstore nvstore;
239
240         uint16_t        max_qentries;   /* max entries per queue */
241         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
242         uint32_t        num_cqueues;
243         uint32_t        num_squeues;
244
245         struct pci_nvme_ioreq *ioreqs;
246         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
247         uint32_t        pending_ios;
248         uint32_t        ioslots;
249         sem_t           iosemlock;
250
251         /*
252          * Memory mapped Submission and Completion queues
253          * Each array includes both Admin and IO queues
254          */
255         struct nvme_completion_queue *compl_queues;
256         struct nvme_submission_queue *submit_queues;
257
258         /* controller features */
259         uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
260         uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
261         uint32_t        async_ev_config;         /* 0x0B: async event config */
262
263         enum nvme_dsm_type dataset_management;
264 };
265
266
267 static void pci_nvme_io_partial(struct blockif_req *br, int err);
268
269 /* Controller Configuration utils */
270 #define NVME_CC_GET_EN(cc) \
271         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
272 #define NVME_CC_GET_CSS(cc) \
273         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
274 #define NVME_CC_GET_SHN(cc) \
275         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
276 #define NVME_CC_GET_IOSQES(cc) \
277         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
278 #define NVME_CC_GET_IOCQES(cc) \
279         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
280
281 #define NVME_CC_WRITE_MASK \
282         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
283          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
284          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
285
286 #define NVME_CC_NEN_WRITE_MASK \
287         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
288          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
289          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
290
291 /* Controller Status utils */
292 #define NVME_CSTS_GET_RDY(sts) \
293         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
294
295 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
296
297 /* Completion Queue status word utils */
298 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
299 #define NVME_STATUS_MASK \
300         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
301          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
302
303 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
304         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
305
306 static __inline void
307 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
308 {
309         size_t len;
310
311         len = strnlen(src, dst_size);
312         memset(dst, pad, dst_size);
313         memcpy(dst, src, len);
314 }
315
316 static __inline void
317 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
318 {
319
320         *status &= ~NVME_STATUS_MASK;
321         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
322                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
323 }
324
325 static __inline void
326 pci_nvme_status_genc(uint16_t *status, uint16_t code)
327 {
328
329         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
330 }
331
332 /*
333  * Initialize the requested number of IO Submission and Completion Queues.
334  * Admin queues are allocated implicitly.
335  */
336 static void
337 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
338 {
339         uint32_t i;
340
341         /*
342          * Allocate and initialize the Submission Queues
343          */
344         if (nsq > NVME_QUEUES) {
345                 WPRINTF("%s: clamping number of SQ from %u to %u",
346                                         __func__, nsq, NVME_QUEUES);
347                 nsq = NVME_QUEUES;
348         }
349
350         sc->num_squeues = nsq;
351
352         sc->submit_queues = calloc(sc->num_squeues + 1,
353                                 sizeof(struct nvme_submission_queue));
354         if (sc->submit_queues == NULL) {
355                 WPRINTF("%s: SQ allocation failed", __func__);
356                 sc->num_squeues = 0;
357         } else {
358                 struct nvme_submission_queue *sq = sc->submit_queues;
359
360                 for (i = 0; i < sc->num_squeues; i++)
361                         pthread_mutex_init(&sq[i].mtx, NULL);
362         }
363
364         /*
365          * Allocate and initialize the Completion Queues
366          */
367         if (ncq > NVME_QUEUES) {
368                 WPRINTF("%s: clamping number of CQ from %u to %u",
369                                         __func__, ncq, NVME_QUEUES);
370                 ncq = NVME_QUEUES;
371         }
372
373         sc->num_cqueues = ncq;
374
375         sc->compl_queues = calloc(sc->num_cqueues + 1,
376                                 sizeof(struct nvme_completion_queue));
377         if (sc->compl_queues == NULL) {
378                 WPRINTF("%s: CQ allocation failed", __func__);
379                 sc->num_cqueues = 0;
380         } else {
381                 struct nvme_completion_queue *cq = sc->compl_queues;
382
383                 for (i = 0; i < sc->num_cqueues; i++)
384                         pthread_mutex_init(&cq[i].mtx, NULL);
385         }
386 }
387
388 static void
389 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
390 {
391         struct nvme_controller_data *cd = &sc->ctrldata;
392
393         cd->vid = 0xFB5D;
394         cd->ssvid = 0x0000;
395
396         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
397         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
398
399         /* Num of submission commands that we can handle at a time (2^rab) */
400         cd->rab   = 4;
401
402         /* FreeBSD OUI */
403         cd->ieee[0] = 0x58;
404         cd->ieee[1] = 0x9c;
405         cd->ieee[2] = 0xfc;
406
407         cd->mic = 0;
408
409         cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
410
411         cd->ver = 0x00010300;
412
413         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
414         cd->acl = 2;
415         cd->aerl = 4;
416
417         cd->lpa = 0;    /* TODO: support some simple things like SMART */
418         cd->elpe = 0;   /* max error log page entries */
419         cd->npss = 1;   /* number of power states supported */
420
421         /* Warning Composite Temperature Threshold */
422         cd->wctemp = 0x0157;
423
424         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
425             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
426         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
427             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
428         cd->nn = 1;     /* number of namespaces */
429
430         cd->oncs = 0;
431         switch (sc->dataset_management) {
432         case NVME_DATASET_MANAGEMENT_AUTO:
433                 if (sc->nvstore.deallocate)
434                         cd->oncs |= NVME_ONCS_DSM;
435                 break;
436         case NVME_DATASET_MANAGEMENT_ENABLE:
437                 cd->oncs |= NVME_ONCS_DSM;
438                 break;
439         default:
440                 break;
441         }
442
443         cd->fna = 0x03;
444
445         cd->power_state[0].mp = 10;
446 }
447
448 /*
449  * Calculate the CRC-16 of the given buffer
450  * See copyright attribution at top of file
451  */
452 static uint16_t
453 crc16(uint16_t crc, const void *buffer, unsigned int len)
454 {
455         const unsigned char *cp = buffer;
456         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
457         static uint16_t const crc16_table[256] = {
458                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
459                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
460                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
461                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
462                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
463                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
464                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
465                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
466                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
467                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
468                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
469                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
470                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
471                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
472                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
473                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
474                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
475                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
476                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
477                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
478                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
479                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
480                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
481                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
482                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
483                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
484                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
485                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
486                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
487                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
488                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
489                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
490         };
491
492         while (len--)
493                 crc = (((crc >> 8) & 0xffU) ^
494                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
495         return crc;
496 }
497
498 static void
499 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
500     struct nvme_namespace_data *nd, uint32_t nsid,
501     struct pci_nvme_blockstore *nvstore)
502 {
503
504         /* Get capacity and block size information from backing store */
505         nd->nsze = nvstore->size / nvstore->sectsz;
506         nd->ncap = nd->nsze;
507         nd->nuse = nd->nsze;
508
509         if (nvstore->type == NVME_STOR_BLOCKIF)
510                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
511
512         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
513         nd->flbas = 0;
514
515         /* Create an EUI-64 if user did not provide one */
516         if (nvstore->eui64 == 0) {
517                 char *data = NULL;
518                 uint64_t eui64 = nvstore->eui64;
519
520                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
521                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
522
523                 if (data != NULL) {
524                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
525                         free(data);
526                 }
527                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
528         }
529         be64enc(nd->eui64, nvstore->eui64);
530
531         /* LBA data-sz = 2^lbads */
532         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
533 }
534
535 static void
536 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
537 {
538
539         memset(&sc->err_log, 0, sizeof(sc->err_log));
540         memset(&sc->health_log, 0, sizeof(sc->health_log));
541         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
542 }
543
544 static void
545 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
546 {
547         uint32_t i;
548
549         DPRINTF("%s", __func__);
550
551         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
552             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
553             (60 << NVME_CAP_LO_REG_TO_SHIFT);
554
555         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
556
557         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
558
559         sc->regs.cc = 0;
560         sc->regs.csts = 0;
561
562         assert(sc->submit_queues != NULL);
563
564         for (i = 0; i < sc->num_squeues + 1; i++) {
565                 sc->submit_queues[i].qbase = NULL;
566                 sc->submit_queues[i].size = 0;
567                 sc->submit_queues[i].cqid = 0;
568                 sc->submit_queues[i].tail = 0;
569                 sc->submit_queues[i].head = 0;
570         }
571
572         assert(sc->compl_queues != NULL);
573
574         for (i = 0; i < sc->num_cqueues + 1; i++) {
575                 sc->compl_queues[i].qbase = NULL;
576                 sc->compl_queues[i].size = 0;
577                 sc->compl_queues[i].tail = 0;
578                 sc->compl_queues[i].head = 0;
579         }
580 }
581
582 static void
583 pci_nvme_reset(struct pci_nvme_softc *sc)
584 {
585         pthread_mutex_lock(&sc->mtx);
586         pci_nvme_reset_locked(sc);
587         pthread_mutex_unlock(&sc->mtx);
588 }
589
590 static void
591 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
592 {
593         uint16_t acqs, asqs;
594
595         DPRINTF("%s", __func__);
596
597         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
598         sc->submit_queues[0].size = asqs;
599         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
600                     sizeof(struct nvme_command) * asqs);
601
602         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
603                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
604
605         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
606             NVME_AQA_REG_ACQS_MASK) + 1;
607         sc->compl_queues[0].size = acqs;
608         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
609                  sizeof(struct nvme_completion) * acqs);
610
611         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
612                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
613 }
614
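/*
 * Copy "len" bytes between the flat buffer "b" and the guest memory described
 * by the prp1/prp2 pair. Only transfers spanning at most two pages (8 KiB)
 * are handled here; PRP lists are not interpreted.
 */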
615 static int
616 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
617         size_t len, enum nvme_copy_dir dir)
618 {
619         uint8_t *p;
620         size_t bytes;
621
622         if (len > (8 * 1024)) {
623                 return (-1);
624         }
625
626         /* Copy from the start of prp1 to the end of the physical page */
627         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
628         bytes = MIN(bytes, len);
629
630         p = vm_map_gpa(ctx, prp1, bytes);
631         if (p == NULL) {
632                 return (-1);
633         }
634
635         if (dir == NVME_COPY_TO_PRP)
636                 memcpy(p, b, bytes);
637         else
638                 memcpy(b, p, bytes);
639
640         b += bytes;
641
642         len -= bytes;
643         if (len == 0) {
644                 return (0);
645         }
646
647         len = MIN(len, PAGE_SIZE);
648
649         p = vm_map_gpa(ctx, prp2, len);
650         if (p == NULL) {
651                 return (-1);
652         }
653
654         if (dir == NVME_COPY_TO_PRP)
655                 memcpy(p, b, len);
656         else
657                 memcpy(b, p, len);
658
659         return (0);
660 }
661
662 /*
663  * Write a Completion Queue Entry update
664  *
665  * Write the completion and update the doorbell value
666  */
667 static void
668 pci_nvme_cq_update(struct pci_nvme_softc *sc,
669                 struct nvme_completion_queue *cq,
670                 uint32_t cdw0,
671                 uint16_t cid,
672                 uint16_t sqid,
673                 uint16_t status)
674 {
675         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
676         struct nvme_completion *cqe;
677
678         assert(cq->qbase != NULL);
679
680         pthread_mutex_lock(&cq->mtx);
681
682         cqe = &cq->qbase[cq->tail];
683
684         /* Flip the phase bit */
685         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
686
687         cqe->cdw0 = cdw0;
688         cqe->sqhd = sq->head;
689         cqe->sqid = sqid;
690         cqe->cid = cid;
691         cqe->status = status;
692
693         cq->tail++;
694         if (cq->tail >= cq->size) {
695                 cq->tail = 0;
696         }
697
698         pthread_mutex_unlock(&cq->mtx);
699 }
700
701 static int
702 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
703         struct nvme_completion* compl)
704 {
705         uint16_t qid = command->cdw10 & 0xffff;
706
707         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
708         if (qid == 0 || qid > sc->num_squeues ||
709             (sc->submit_queues[qid].qbase == NULL)) {
710                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
711                         __func__, qid, sc->num_squeues);
712                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
713                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
714                 return (1);
715         }
716
717         sc->submit_queues[qid].qbase = NULL;
718         sc->submit_queues[qid].cqid = 0;
719         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
720         return (1);
721 }
722
723 static int
724 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
725         struct nvme_completion* compl)
726 {
727         if (command->cdw11 & NVME_CMD_CDW11_PC) {
728                 uint16_t qid = command->cdw10 & 0xffff;
729                 struct nvme_submission_queue *nsq;
730
731                 if ((qid == 0) || (qid > sc->num_squeues) ||
732                     (sc->submit_queues[qid].qbase != NULL)) {
733                         WPRINTF("%s queue index %u > num_squeues %u",
734                                 __func__, qid, sc->num_squeues);
735                         pci_nvme_status_tc(&compl->status,
736                             NVME_SCT_COMMAND_SPECIFIC,
737                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
738                         return (1);
739                 }
740
741                 nsq = &sc->submit_queues[qid];
742                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
743                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
744                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
745                         /*
746                          * Queues must specify at least two entries
747                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
748                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
749                          */
750                         pci_nvme_status_tc(&compl->status,
751                             NVME_SCT_COMMAND_SPECIFIC,
752                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
753                         return (1);
754                 }
755
756                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
757                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
758                         pci_nvme_status_tc(&compl->status,
759                             NVME_SCT_COMMAND_SPECIFIC,
760                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
761                         return (1);
762                 }
763
764                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
765                         pci_nvme_status_tc(&compl->status,
766                             NVME_SCT_COMMAND_SPECIFIC,
767                             NVME_SC_COMPLETION_QUEUE_INVALID);
768                         return (1);
769                 }
770
771                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
772
773                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
774                               sizeof(struct nvme_command) * (size_t)nsq->size);
775
776                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
777                         qid, nsq->size, nsq->qbase, nsq->cqid);
778
779                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
780
781                 DPRINTF("%s completed creating IOSQ qid %u",
782                          __func__, qid);
783         } else {
784                 /* 
785                  * Guest sent non-cont submission queue request.
786                  * This setting is unsupported by this emulation.
787                  */
788                 WPRINTF("%s unsupported non-contig (list-based) "
789                          "create i/o submission queue", __func__);
790
791                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
792         }
793         return (1);
794 }
795
796 static int
797 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
798         struct nvme_completion* compl)
799 {
800         uint16_t qid = command->cdw10 & 0xffff;
801         uint16_t sqid;
802
803         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
804         if (qid == 0 || qid > sc->num_cqueues ||
805             (sc->compl_queues[qid].qbase == NULL)) {
806                 WPRINTF("%s queue index %u / num_cqueues %u",
807                         __func__, qid, sc->num_cqueues);
808                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
809                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
810                 return (1);
811         }
812
813         /* Deleting an Active CQ is an error */
814         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
815                 if (sc->submit_queues[sqid].cqid == qid) {
816                         pci_nvme_status_tc(&compl->status,
817                             NVME_SCT_COMMAND_SPECIFIC,
818                             NVME_SC_INVALID_QUEUE_DELETION);
819                         return (1);
820                 }
821
822         sc->compl_queues[qid].qbase = NULL;
823         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
824         return (1);
825 }
826
827 static int
828 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
829         struct nvme_completion* compl)
830 {
831         struct nvme_completion_queue *ncq;
832         uint16_t qid = command->cdw10 & 0xffff;
833
834         /* Only support Physically Contiguous queues */
835         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
836                 WPRINTF("%s unsupported non-contig (list-based) "
837                          "create i/o completion queue",
838                          __func__);
839
840                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
841                 return (1);
842         }
843
844         if ((qid == 0) || (qid > sc->num_cqueues) ||
845             (sc->compl_queues[qid].qbase != NULL)) {
846                 WPRINTF("%s queue index %u > num_cqueues %u",
847                         __func__, qid, sc->num_cqueues);
848                 pci_nvme_status_tc(&compl->status,
849                     NVME_SCT_COMMAND_SPECIFIC,
850                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
851                 return (1);
852         }
853
854         ncq = &sc->compl_queues[qid];
855         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
856         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
857         if (ncq->intr_vec > (sc->max_queues + 1)) {
858                 pci_nvme_status_tc(&compl->status,
859                     NVME_SCT_COMMAND_SPECIFIC,
860                     NVME_SC_INVALID_INTERRUPT_VECTOR);
861                 return (1);
862         }
863
864         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
865         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
866                 /*
867                  * Queues must specify at least two entries
868                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
869                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
870                  */
871                 pci_nvme_status_tc(&compl->status,
872                     NVME_SCT_COMMAND_SPECIFIC,
873                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
874                 return (1);
875         }
876         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
877                      command->prp1,
878                      sizeof(struct nvme_command) * (size_t)ncq->size);
879
880         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
881
882
883         return (1);
884 }
885
886 static int
887 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
888         struct nvme_completion* compl)
889 {
890         uint32_t logsize;
891         uint8_t logpage = command->cdw10 & 0xFF;
892
893         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
894
895         /*
896          * Command specifies the number of dwords to return in fields NUMDU
897          * and NUMDL. This is a zero-based value.
898          */
899         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
900         logsize *= sizeof(uint32_t);
901
902         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
903
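        /*
         * For example (illustrative): NUMDL = 0x3ff with NUMDU = 0 requests
         * 0x400 dwords, so logsize becomes 1024 * sizeof(uint32_t) = 4096 bytes.
         */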
904         switch (logpage) {
905         case NVME_LOG_ERROR:
906                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
907                     command->prp2, (uint8_t *)&sc->err_log,
908                     MIN(logsize, sizeof(sc->err_log)),
909                     NVME_COPY_TO_PRP);
910                 break;
911         case NVME_LOG_HEALTH_INFORMATION:
912                 /* TODO: present some smart info */
913                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
914                     command->prp2, (uint8_t *)&sc->health_log,
915                     MIN(logsize, sizeof(sc->health_log)),
916                     NVME_COPY_TO_PRP);
917                 break;
918         case NVME_LOG_FIRMWARE_SLOT:
919                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
920                     command->prp2, (uint8_t *)&sc->fw_log,
921                     MIN(logsize, sizeof(sc->fw_log)),
922                     NVME_COPY_TO_PRP);
923                 break;
924         default:
925                 DPRINTF("%s get log page %x command not supported",
926                         __func__, logpage);
927
928                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
929                     NVME_SC_INVALID_LOG_PAGE);
930         }
931
932         return (1);
933 }
934
935 static int
936 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
937         struct nvme_completion* compl)
938 {
939         void *dest;
940         uint16_t status;
941
942         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
943                 command->cdw10 & 0xFF, command->nsid);
944
945         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
946
947         switch (command->cdw10 & 0xFF) {
948         case 0x00: /* return Identify Namespace data structure */
949                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
950                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
951                     NVME_COPY_TO_PRP);
952                 break;
953         case 0x01: /* return Identify Controller data structure */
954                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
955                     command->prp2, (uint8_t *)&sc->ctrldata,
956                     sizeof(sc->ctrldata),
957                     NVME_COPY_TO_PRP);
958                 break;
959         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
960                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
961                                   sizeof(uint32_t) * 1024);
962                 ((uint32_t *)dest)[0] = 1;
963                 ((uint32_t *)dest)[1] = 0;
964                 break;
965         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
966                 if (command->nsid != 1) {
967                         pci_nvme_status_genc(&status,
968                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
969                         break;
970                 }
971                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
972                                   sizeof(uint32_t) * 1024);
973                 /* All bytes after the descriptor shall be zero */
974                 bzero(dest, sizeof(uint32_t) * 1024);
975
976                 /* Return NIDT=1 (i.e. EUI64) descriptor */
977                 ((uint8_t *)dest)[0] = 1;
978                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
979                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
980                 break;
981         default:
982                 DPRINTF("%s unsupported identify command requested 0x%x",
983                          __func__, command->cdw10 & 0xFF);
984                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
985                 return (1);
986         }
987
988         compl->status = status;
989         return (1);
990 }
991
992 static int
993 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
994         struct nvme_completion* compl)
995 {
996         uint16_t nqr;   /* Number of Queues Requested */
997
998         nqr = command->cdw11 & 0xFFFF;
999         if (nqr == 0xffff) {
1000                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1001                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1002                 return (-1);
1003         }
1004
1005         sc->num_squeues = ONE_BASED(nqr);
1006         if (sc->num_squeues > sc->max_queues) {
1007                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1008                                         sc->max_queues);
1009                 sc->num_squeues = sc->max_queues;
1010         }
1011
1012         nqr = (command->cdw11 >> 16) & 0xFFFF;
1013         if (nqr == 0xffff) {
1014                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1015                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1016                 return (-1);
1017         }
1018
1019         sc->num_cqueues = ONE_BASED(nqr);
1020         if (sc->num_cqueues > sc->max_queues) {
1021                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1022                                         sc->max_queues);
1023                 sc->num_cqueues = sc->max_queues;
1024         }
1025
1026         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1027
1028         return (0);
1029 }
1030
1031 static int
1032 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1033         struct nvme_completion* compl)
1034 {
1035         int feature = command->cdw10 & 0xFF;
1036         uint32_t iv;
1037
1038         DPRINTF("%s feature 0x%x", __func__, feature);
1039         compl->cdw0 = 0;
1040
1041         switch (feature) {
1042         case NVME_FEAT_ARBITRATION:
1043                 DPRINTF("  arbitration 0x%x", command->cdw11);
1044                 break;
1045         case NVME_FEAT_POWER_MANAGEMENT:
1046                 DPRINTF("  power management 0x%x", command->cdw11);
1047                 break;
1048         case NVME_FEAT_LBA_RANGE_TYPE:
1049                 DPRINTF("  lba range 0x%x", command->cdw11);
1050                 break;
1051         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1052                 DPRINTF("  temperature threshold 0x%x", command->cdw11);
1053                 break;
1054         case NVME_FEAT_ERROR_RECOVERY:
1055                 DPRINTF("  error recovery 0x%x", command->cdw11);
1056                 break;
1057         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1058                 DPRINTF("  volatile write cache 0x%x", command->cdw11);
1059                 break;
1060         case NVME_FEAT_NUMBER_OF_QUEUES:
1061                 nvme_set_feature_queues(sc, command, compl);
1062                 break;
1063         case NVME_FEAT_INTERRUPT_COALESCING:
1064                 DPRINTF("  interrupt coalescing 0x%x", command->cdw11);
1065
1066                 /* in uS */
1067                 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
1068
1069                 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
1070                 break;
1071         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1072                 iv = command->cdw11 & 0xFFFF;
1073
1074                 DPRINTF("  interrupt vector configuration 0x%x",
1075                         command->cdw11);
1076
1077                 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
1078                         if (sc->compl_queues[i].intr_vec == iv) {
1079                                 if (command->cdw11 & (1 << 16))
1080                                         sc->compl_queues[i].intr_en |=
1081                                                               NVME_CQ_INTCOAL;  
1082                                 else
1083                                         sc->compl_queues[i].intr_en &=
1084                                                              ~NVME_CQ_INTCOAL;  
1085                         }
1086                 }
1087                 break;
1088         case NVME_FEAT_WRITE_ATOMICITY:
1089                 DPRINTF("  write atomicity 0x%x", command->cdw11);
1090                 break;
1091         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1092                 DPRINTF("  async event configuration 0x%x",
1093                         command->cdw11);
1094                 sc->async_ev_config = command->cdw11;
1095                 break;
1096         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1097                 DPRINTF("  software progress marker 0x%x",
1098                         command->cdw11);
1099                 break;
1100         case 0x0C:
1101                 DPRINTF("  autonomous power state transition 0x%x",
1102                         command->cdw11);
1103                 break;
1104         default:
1105                 WPRINTF("%s invalid feature", __func__);
1106                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1107                 return (1);
1108         }
1109
1110         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1111         return (1);
1112 }
1113
1114 static int
1115 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1116         struct nvme_completion* compl)
1117 {
1118         int feature = command->cdw10 & 0xFF;
1119
1120         DPRINTF("%s feature 0x%x", __func__, feature);
1121
1122         compl->cdw0 = 0;
1123
1124         switch (feature) {
1125         case NVME_FEAT_ARBITRATION:
1126                 DPRINTF("  arbitration");
1127                 break;
1128         case NVME_FEAT_POWER_MANAGEMENT:
1129                 DPRINTF("  power management");
1130                 break;
1131         case NVME_FEAT_LBA_RANGE_TYPE:
1132                 DPRINTF("  lba range");
1133                 break;
1134         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1135                 DPRINTF("  temperature threshold");
1136                 switch ((command->cdw11 >> 20) & 0x3) {
1137                 case 0:
1138                         /* Over temp threshold */
1139                         compl->cdw0 = 0xFFFF;
1140                         break;
1141                 case 1:
1142                         /* Under temp threshold */
1143                         compl->cdw0 = 0;
1144                         break;
1145                 default:
1146                         WPRINTF("  invalid threshold type select");
1147                         pci_nvme_status_genc(&compl->status,
1148                             NVME_SC_INVALID_FIELD);
1149                         return (1);
1150                 }
1151                 break;
1152         case NVME_FEAT_ERROR_RECOVERY:
1153                 DPRINTF("  error recovery");
1154                 break;
1155         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1156                 DPRINTF("  volatile write cache");
1157                 break;
1158         case NVME_FEAT_NUMBER_OF_QUEUES:
1159                 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1160
1161                 DPRINTF("  number of queues (submit %u, completion %u)",
1162                         compl->cdw0 & 0xFFFF,
1163                         (compl->cdw0 >> 16) & 0xFFFF);
1164
1165                 break;
1166         case NVME_FEAT_INTERRUPT_COALESCING:
1167                 DPRINTF("  interrupt coalescing");
1168                 break;
1169         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1170                 DPRINTF("  interrupt vector configuration");
1171                 break;
1172         case NVME_FEAT_WRITE_ATOMICITY:
1173                 DPRINTF("  write atomicity");
1174                 break;
1175         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1176                 DPRINTF("  async event configuration");
1177                 sc->async_ev_config = command->cdw11;
1178                 break;
1179         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1180                 DPRINTF("  software progress marker");
1181                 break;
1182         case 0x0C:
1183                 DPRINTF("  autonomous power state transition");
1184                 break;
1185         default:
1186                 WPRINTF("%s invalid feature 0x%x", __func__, feature);
1187                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1188                 return (1);
1189         }
1190
1191         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1192         return (1);
1193 }
1194
1195 static int
1196 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1197         struct nvme_completion* compl)
1198 {
1199         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1200                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1201
1202         /* TODO: search for the command ID and abort it */
1203
1204         compl->cdw0 = 1;
1205         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1206         return (1);
1207 }
1208
1209 static int
1210 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1211         struct nvme_command* command, struct nvme_completion* compl)
1212 {
1213         DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1214
1215         /*
1216          * TODO: raise events when they happen based on the Set Features cmd.
1217          * These events happen async, so only set completion successful if
1218          * there is an event reflective of the request to get event.
1219          */
1220         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1221             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1222         return (0);
1223 }
1224
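/*
 * Process any commands pending on the Admin Submission Queue (queue 0),
 * posting a completion entry for each and generating an MSI-X interrupt on
 * vector 0 if new completions were written.
 */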
1225 static void
1226 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1227 {
1228         struct nvme_completion compl;
1229         struct nvme_command *cmd;
1230         struct nvme_submission_queue *sq;
1231         struct nvme_completion_queue *cq;
1232         uint16_t sqhead;
1233
1234         DPRINTF("%s index %u", __func__, (uint32_t)value);
1235
1236         sq = &sc->submit_queues[0];
1237         cq = &sc->compl_queues[0];
1238
1239         pthread_mutex_lock(&sq->mtx);
1240
1241         sqhead = sq->head;
1242         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1243         
1244         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1245                 cmd = &(sq->qbase)[sqhead];
1246                 compl.status = 0;
1247
1248                 switch (cmd->opc) {
1249                 case NVME_OPC_DELETE_IO_SQ:
1250                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1251                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1252                         break;
1253                 case NVME_OPC_CREATE_IO_SQ:
1254                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1255                         nvme_opc_create_io_sq(sc, cmd, &compl);
1256                         break;
1257                 case NVME_OPC_DELETE_IO_CQ:
1258                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1259                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1260                         break;
1261                 case NVME_OPC_CREATE_IO_CQ:
1262                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1263                         nvme_opc_create_io_cq(sc, cmd, &compl);
1264                         break;
1265                 case NVME_OPC_GET_LOG_PAGE:
1266                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1267                         nvme_opc_get_log_page(sc, cmd, &compl);
1268                         break;
1269                 case NVME_OPC_IDENTIFY:
1270                         DPRINTF("%s command IDENTIFY", __func__);
1271                         nvme_opc_identify(sc, cmd, &compl);
1272                         break;
1273                 case NVME_OPC_ABORT:
1274                         DPRINTF("%s command ABORT", __func__);
1275                         nvme_opc_abort(sc, cmd, &compl);
1276                         break;
1277                 case NVME_OPC_SET_FEATURES:
1278                         DPRINTF("%s command SET_FEATURES", __func__);
1279                         nvme_opc_set_features(sc, cmd, &compl);
1280                         break;
1281                 case NVME_OPC_GET_FEATURES:
1282                         DPRINTF("%s command GET_FEATURES", __func__);
1283                         nvme_opc_get_features(sc, cmd, &compl);
1284                         break;
1285                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1286                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1287                         /* XXX don't care, unhandled for now
1288                         nvme_opc_async_event_req(sc, cmd, &compl);
1289                         */
1290                         compl.status = NVME_NO_STATUS;
1291                         break;
1292                 default:
1293                         WPRINTF("0x%x command is not implemented",
1294                             cmd->opc);
1295                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1296                 }
1297                 sqhead = (sqhead + 1) % sq->size;
1298
1299                 if (NVME_COMPLETION_VALID(compl)) {
1300                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
1301                             compl.cdw0,
1302                             cmd->cid,
1303                             0,          /* SQID */
1304                             compl.status);
1305                 }
1306         }
1307
1308         DPRINTF("setting sqhead %u", sqhead);
1309         sq->head = sqhead;
1310
1311         if (cq->head != cq->tail)
1312                 pci_generate_msix(sc->nsc_pi, 0);
1313
1314         pthread_mutex_unlock(&sq->mtx);
1315 }
1316
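/*
 * Append a guest physical range to the blockif request backing this I/O,
 * merging it with the previous range when the two are contiguous. If the
 * iov list fills up (NVME_MAX_BLOCKIOVS), the accumulated data is submitted
 * as a partial request and completion is awaited before continuing. For
 * RAM-backed namespaces the data is copied directly instead.
 */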
1317 static int
1318 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1319         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1320 {
1321         int iovidx;
1322
1323         if (req != NULL) {
1324                 /* concatenate contig block-iovs to minimize number of iovs */
1325                 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1326                         iovidx = req->io_req.br_iovcnt - 1;
1327
1328                         req->io_req.br_iov[iovidx].iov_base =
1329                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1330                                              req->prev_gpaddr, size);
1331
1332                         req->prev_size += size;
1333                         req->io_req.br_resid += size;
1334
1335                         req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1336                 } else {
1337                         pthread_mutex_lock(&req->mtx);
1338
1339                         iovidx = req->io_req.br_iovcnt;
1340                         if (iovidx == NVME_MAX_BLOCKIOVS) {
1341                                 int err = 0;
1342
1343                                 DPRINTF("large I/O, doing partial req");
1344
1345                                 iovidx = 0;
1346                                 req->io_req.br_iovcnt = 0;
1347
1348                                 req->io_req.br_callback = pci_nvme_io_partial;
1349
1350                                 if (!do_write)
1351                                         err = blockif_read(sc->nvstore.ctx,
1352                                                            &req->io_req);
1353                                 else
1354                                         err = blockif_write(sc->nvstore.ctx,
1355                                                             &req->io_req);
1356
1357                                 /* wait until req completes before cont */
1358                                 if (err == 0)
1359                                         pthread_cond_wait(&req->cv, &req->mtx);
1360                         }
1361                         if (iovidx == 0) {
1362                                 req->io_req.br_offset = lba;
1363                                 req->io_req.br_resid = 0;
1364                                 req->io_req.br_param = req;
1365                         }
1366
1367                         req->io_req.br_iov[iovidx].iov_base =
1368                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1369                                              gpaddr, size);
1370
1371                         req->io_req.br_iov[iovidx].iov_len = size;
1372
1373                         req->prev_gpaddr = gpaddr;
1374                         req->prev_size = size;
1375                         req->io_req.br_resid += size;
1376
1377                         req->io_req.br_iovcnt++;
1378
1379                         pthread_mutex_unlock(&req->mtx);
1380                 }
1381         } else {
1382                 /* RAM buffer: read/write directly */
1383                 void *p = sc->nvstore.ctx;
1384                 void *gptr;
1385
1386                 if ((lba + size) > sc->nvstore.size) {
1387                         WPRINTF("%s write would overflow RAM", __func__);
1388                         return (-1);
1389                 }
1390
1391                 p = (void *)((uintptr_t)p + (uintptr_t)lba);
1392                 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1393                 if (do_write) 
1394                         memcpy(p, gptr, size);
1395                 else
1396                         memcpy(gptr, p, size);
1397         }
1398         return (0);
1399 }
1400
1401 static void
1402 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1403         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1404         uint32_t cdw0, uint16_t status)
1405 {
1406         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1407
1408         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1409                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1410                  NVME_STATUS_GET_SC(status));
1411
1412         pci_nvme_cq_update(sc, cq,
1413             0,          /* CDW0 */
1414             cid,
1415             sqid,
1416             status);
1417
1418         if (cq->head != cq->tail) {
1419                 if (cq->intr_en & NVME_CQ_INTEN) {
1420                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1421                 } else {
1422                         DPRINTF("%s: CQ%u interrupt disabled",
1423                                                 __func__, sq->cqid);
1424                 }
1425         }
1426 }
1427
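     /*
      * I/O request slots are a fixed pool of 'ioslots' entries guarded by the
      * iosemlock counting semaphore: pci_nvme_get_ioreq() blocks on the
      * semaphore and pops a free slot, pci_nvme_release_ioreq() returns the
      * slot to the free list and posts the semaphore.  This bounds the number
      * of blockif requests outstanding at any time.
      */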
1428 static void
1429 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1430 {
1431         req->sc = NULL;
1432         req->nvme_sq = NULL;
1433         req->sqid = 0;
1434
1435         pthread_mutex_lock(&sc->mtx);
1436
1437         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1438         sc->pending_ios--;
1439
1440         /* once no I/O is pending, mark the controller ready if it has been enabled */
1441         if (sc->pending_ios == 0 &&
1442             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1443                 sc->regs.csts |= NVME_CSTS_RDY;
1444
1445         pthread_mutex_unlock(&sc->mtx);
1446
1447         sem_post(&sc->iosemlock);
1448 }
1449
1450 static struct pci_nvme_ioreq *
1451 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1452 {
1453         struct pci_nvme_ioreq *req = NULL;
1454
1455         sem_wait(&sc->iosemlock);
1456         pthread_mutex_lock(&sc->mtx);
1457
1458         req = STAILQ_FIRST(&sc->ioreqs_free);
1459         assert(req != NULL);
1460         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1461
1462         req->sc = sc;
1463
1464         sc->pending_ios++;
1465
1466         pthread_mutex_unlock(&sc->mtx);
1467
1468         req->io_req.br_iovcnt = 0;
1469         req->io_req.br_offset = 0;
1470         req->io_req.br_resid = 0;
1471         req->io_req.br_param = req;
1472         req->prev_gpaddr = 0;
1473         req->prev_size = 0;
1474
1475         return req;
1476 }
1477
1478 static void
1479 pci_nvme_io_done(struct blockif_req *br, int err)
1480 {
1481         struct pci_nvme_ioreq *req = br->br_param;
1482         struct nvme_submission_queue *sq = req->nvme_sq;
1483         uint16_t code, status;
1484
1485         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1486
1487         /* TODO return correct error */
1488         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1489         pci_nvme_status_genc(&status, code);
1490
1491         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1492         pci_nvme_release_ioreq(req->sc, req);
1493 }
1494
1495 static void
1496 pci_nvme_io_partial(struct blockif_req *br, int err)
1497 {
1498         struct pci_nvme_ioreq *req = br->br_param;
1499
1500         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1501
1502         pthread_cond_signal(&req->cv);
1503 }
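
     /*
      * pci_nvme_io_done() is the final blockif callback for a command: it
      * posts the NVMe completion and releases the I/O request slot.
      * pci_nvme_io_partial() is used while building up a large transfer; it
      * only wakes the thread waiting on req->cv in pci_nvme_append_iov_req()
      * so the remaining PRP entries can be queued.
      */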
1504
1505 /*
1506  * Implements the Flush command. The specification states:
1507  *    If a volatile write cache is not present, Flush commands complete
1508  *    successfully and have no effect
1509  * in the description of the Volatile Write Cache (VWC) field of the Identify
1510  * Controller data. Therefore, set status to Success if the command is
1511  * not supported (i.e. RAM-backed storage, or the blockif returns EOPNOTSUPP).
1512  */
1513 static bool
1514 nvme_opc_flush(struct pci_nvme_softc *sc,
1515     struct nvme_command *cmd,
1516     struct pci_nvme_blockstore *nvstore,
1517     struct pci_nvme_ioreq *req,
1518     uint16_t *status)
1519 {
1520         bool pending = false;
1521
1522         if (nvstore->type == NVME_STOR_RAM) {
1523                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1524         } else {
1525                 int err;
1526
1527                 req->io_req.br_callback = pci_nvme_io_done;
1528
1529                 err = blockif_flush(nvstore->ctx, &req->io_req);
1530                 switch (err) {
1531                 case 0:
1532                         pending = true;
1533                         break;
1534                 case EOPNOTSUPP:
1535                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1536                         break;
1537                 default:
1538                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1539                 }
1540         }
1541
1542         return (pending);
1543 }
1544
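     /*
      * Handle NVMe Read/Write commands.  Data is described by PRP entries:
      * PRP1 covers the first (possibly unaligned) page of the transfer, and
      * PRP2 is either the second data page (transfers of at most two pages)
      * or the address of a PRP list; the last entry of a full list page
      * points to the next list page.  For example, a 16 KiB transfer starting
      * on a 4 KiB boundary would use PRP1 plus a PRP list holding the
      * remaining three page addresses.
      */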
1545 static bool
1546 nvme_opc_write_read(struct pci_nvme_softc *sc,
1547     struct nvme_command *cmd,
1548     struct pci_nvme_blockstore *nvstore,
1549     struct pci_nvme_ioreq *req,
1550     uint16_t *status)
1551 {
1552         uint64_t lba, nblocks, bytes;
1553         size_t offset;
1554         bool is_write = cmd->opc == NVME_OPC_WRITE;
1555         bool pending = false;
1556
1557         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1558         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1559
1560         offset = lba * nvstore->sectsz;
1561         bytes  = nblocks * nvstore->sectsz;
1562
1563         if ((offset + bytes) > nvstore->size) {
1564                 WPRINTF("%s command would exceed LBA range", __func__);
1565                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
1566                 goto out;
1567         }
1568
1569         req->io_req.br_offset = lba;
1570
1571         /* PRP bits 1:0 must be zero */
1572         cmd->prp1 &= ~0x3UL;
1573         cmd->prp2 &= ~0x3UL;
1574
1575         if (nvstore->type == NVME_STOR_RAM) {
1576                 uint8_t *buf = nvstore->ctx;
1577                 enum nvme_copy_dir dir;
1578
1579                 if (is_write)
1580                         dir = NVME_COPY_TO_PRP;
1581                 else
1582                         dir = NVME_COPY_FROM_PRP;
1583
1584                 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1585                     buf + offset, bytes, dir))
1586                         pci_nvme_status_genc(status,
1587                             NVME_SC_DATA_TRANSFER_ERROR);
1588                 else
1589                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1590         } else {
1591                 uint64_t size;
1592                 int err;
1593
1594                 size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
1595                 if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
1596                     size, is_write, offset)) {
1597                         pci_nvme_status_genc(status,
1598                             NVME_SC_DATA_TRANSFER_ERROR);
1599                         goto out;
1600                 }
1601
1602                 offset += size;
1603                 bytes  -= size;
1604
1605                 if (bytes == 0) {
1606                         ;
1607                 } else if (bytes <= PAGE_SIZE) {
1608                         size = bytes;
1609                         if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
1610                             size, is_write, offset)) {
1611                                 pci_nvme_status_genc(status,
1612                                     NVME_SC_DATA_TRANSFER_ERROR);
1613                                 goto out;
1614                         }
1615                 } else {
1616                         void *vmctx = sc->nsc_pi->pi_vmctx;
1617                         uint64_t *prp_list = &cmd->prp2;
1618                         uint64_t *last = prp_list;
1619
1620                         /* PRP2 is pointer to a physical region page list */
1621                         while (bytes) {
1622                                 /* Last entry in list points to the next list */
1623                                 if (prp_list == last) {
1624                                         uint64_t prp = *prp_list;
1625
1626                                         prp_list = paddr_guest2host(vmctx, prp,
1627                                             PAGE_SIZE - (prp % PAGE_SIZE));
1628                                         last = prp_list + (NVME_PRP2_ITEMS - 1);
1629                                 }
1630
1631                                 size = MIN(bytes, PAGE_SIZE);
1632
1633                                 if (pci_nvme_append_iov_req(sc, req, *prp_list,
1634                                     size, is_write, offset)) {
1635                                         pci_nvme_status_genc(status,
1636                                             NVME_SC_DATA_TRANSFER_ERROR);
1637                                         goto out;
1638                                 }
1639
1640                                 offset += size;
1641                                 bytes  -= size;
1642
1643                                 prp_list++;
1644                         }
1645                 }
1646                 req->io_req.br_callback = pci_nvme_io_done;
1647                 if (is_write)
1648                         err = blockif_write(nvstore->ctx, &req->io_req);
1649                 else
1650                         err = blockif_read(nvstore->ctx, &req->io_req);
1651
1652                 if (err)
1653                         pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
1654                 else
1655                         pending = true;
1656         }
1657 out:
1658         return (pending);
1659 }
1660
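     /*
      * Deallocate (TRIM) state machine.  For multi-range Dataset Management
      * requests, nvme_opc_dataset_mgmt() stashes one byte offset/length pair
      * per range in the br_iov array and repurposes prev_gpaddr as the index
      * of the range in flight and prev_size as the total number of ranges.
      * Each blockif_delete() completion advances to the next range until all
      * ranges are done (or an error occurs), at which point the NVMe
      * completion is posted.
      */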
1661 static void
1662 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1663 {
1664         struct pci_nvme_ioreq *req = br->br_param;
1665         struct pci_nvme_softc *sc = req->sc;
1666         bool done = true;
1667         uint16_t status;
1668
1669         if (err) {
1670                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1671         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1672                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1673         } else {
1674                 struct iovec *iov = req->io_req.br_iov;
1675
1676                 req->prev_gpaddr++;
1677                 iov += req->prev_gpaddr;
1678
1679                 /* The iov_* values already include the sector size */
1680                 req->io_req.br_offset = (off_t)iov->iov_base;
1681                 req->io_req.br_resid = iov->iov_len;
1682                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1683                         pci_nvme_status_genc(&status,
1684                             NVME_SC_INTERNAL_DEVICE_ERROR);
1685                 } else
1686                         done = false;
1687         }
1688
1689         if (done) {
1690                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1691                     req->cid, 0, status);
1692                 pci_nvme_release_ioreq(sc, req);
1693         }
1694 }
1695
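     /*
      * Dataset Management: CDW10[7:0] holds the number of ranges (zero based,
      * so a value of 0 means one range) and the AD (deallocate) attribute bit
      * in CDW11 requests deallocation.  The range descriptors themselves are
      * fetched from guest memory via the PRP entries.
      */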
1696 static bool
1697 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1698     struct nvme_command *cmd,
1699     struct pci_nvme_blockstore *nvstore,
1700     struct pci_nvme_ioreq *req,
1701     uint16_t *status)
1702 {
1703         int err;
1704         bool pending = false;
1705
1706         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1707                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1708                 goto out;
1709         }
1710
1711         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1712                 struct nvme_dsm_range *range;
1713                 uint32_t nr, r;
1714                 int sectsz = sc->nvstore.sectsz;
1715
1716                 /*
1717                  * DSM calls are advisory only, and compliant controllers
1718                  * may choose to take no actions (i.e. return Success).
1719                  */
1720                 if (!nvstore->deallocate) {
1721                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1722                         goto out;
1723                 }
1724
1725                 if (req == NULL) {
1726                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1727                         goto out;
1728                 }
1729
1730                 /* copy locally because a range entry could straddle PRPs */
1731                 range = calloc(1, NVME_MAX_DSM_TRIM);
1732                 if (range == NULL) {
1733                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1734                         goto out;
1735                 }
1736                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1737                     (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1738
1739                 /*
1740                  * If the request is for more than a single range, store
1741                  * the ranges in the br_iov. Optimize for the common case
1742                  * of a single range.
1743                  *
1744                  * Note that NVMe Number of Ranges is a zero based value
1745                  */
1746                 nr = cmd->cdw10 & 0xff;
1747
1748                 req->io_req.br_iovcnt = 0;
1749                 req->io_req.br_offset = range[0].starting_lba * sectsz;
1750                 req->io_req.br_resid = range[0].length * sectsz;
1751
1752                 if (nr == 0) {
1753                         req->io_req.br_callback = pci_nvme_io_done;
1754                 } else {
1755                         struct iovec *iov = req->io_req.br_iov;
1756
1757                         for (r = 0; r <= nr; r++) {
1758                                 iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1759                                 iov[r].iov_len = range[r].length * sectsz;
1760                         }
1761                         req->io_req.br_callback = pci_nvme_dealloc_sm;
1762
1763                         /*
1764                          * Use prev_gpaddr to track the current entry and
1765                          * prev_size to track the number of entries
1766                          */
1767                         req->prev_gpaddr = 0;
1768                         req->prev_size = r;
1769                 }
1770
1771                 err = blockif_delete(nvstore->ctx, &req->io_req);
1772                 if (err)
1773                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1774                 else
1775                         pending = true;
1776
1777                 free(range);
1778         }
1779 out:
1780         return (pending);
1781 }
1782
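     /*
      * Process new entries on an I/O submission queue.  The queue tail was
      * updated by the doorbell write; commands between the current head and
      * that tail are dispatched by opcode.  Handlers return 'pending' when
      * the command was handed to blockif and will be completed from its
      * callback; otherwise the completion is posted immediately below.
      */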
1783 static void
1784 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1785 {
1786         struct nvme_submission_queue *sq;
1787         uint16_t status;
1788         uint16_t sqhead;
1789
1790         /* handle all submissions up to sq->tail index */
1791         sq = &sc->submit_queues[idx];
1792
1793         pthread_mutex_lock(&sq->mtx);
1794
1795         sqhead = sq->head;
1796         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1797                  idx, sqhead, sq->tail, sq->qbase);
1798
1799         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1800                 struct nvme_command *cmd;
1801                 struct pci_nvme_ioreq *req;
1802                 uint32_t nsid;
1803                 bool pending;
1804
1805                 pending = false;
1806                 req = NULL;
1807                 status = 0;
1808
1809                 cmd = &sq->qbase[sqhead];
1810                 sqhead = (sqhead + 1) % sq->size;
1811
1812                 nsid = le32toh(cmd->nsid);
1813                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
1814                         pci_nvme_status_genc(&status,
1815                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1816                         status |=
1817                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
1818                         goto complete;
1819                 }
1820
1821                 req = pci_nvme_get_ioreq(sc);
1822                 if (req == NULL) {
1823                         pci_nvme_status_genc(&status,
1824                             NVME_SC_INTERNAL_DEVICE_ERROR);
1825                         WPRINTF("%s: unable to allocate IO req", __func__);
1826                         goto complete;
1827                 }
1828                 req->nvme_sq = sq;
1829                 req->sqid = idx;
1830                 req->opc = cmd->opc;
1831                 req->cid = cmd->cid;
1832                 req->nsid = cmd->nsid;
1833
1834                 switch (cmd->opc) {
1835                 case NVME_OPC_FLUSH:
1836                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
1837                             req, &status);
1838                         break;
1839                 case NVME_OPC_WRITE:
1840                 case NVME_OPC_READ:
1841                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
1842                             req, &status);
1843                         break;
1844                 case NVME_OPC_WRITE_ZEROES:
1845                         /* TODO: write zeroes
1846                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
1847                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
1848                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1849                         break;
1850                 case NVME_OPC_DATASET_MANAGEMENT:
1851                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
1852                             req, &status);
1853                         break;
1854                 default:
1855                         WPRINTF("%s unhandled io command 0x%x",
1856                             __func__, cmd->opc);
1857                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
1858                 }
1859 complete:
1860                 if (!pending) {
1861                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1862                             status);
1863                         if (req != NULL)
1864                                 pci_nvme_release_ioreq(sc, req);
1865                 }
1866         }
1867
1868         sq->head = sqhead;
1869
1870         pthread_mutex_unlock(&sq->mtx);
1871 }
1872
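     /*
      * Doorbell layout (doorbell stride of 0, i.e. 4-byte registers): the SQ
      * tail doorbell for queue 'qid' lives at BAR0 offset
      * NVME_DOORBELL_OFFSET + qid * 8 and the corresponding CQ head doorbell
      * at NVME_DOORBELL_OFFSET + qid * 8 + 4, which is why
      * pci_nvme_write_bar_0() derives idx = belloffset / 8 and treats the low
      * word of each pair as the submission queue doorbell.
      */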
1873 static void
1874 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1875         uint64_t idx, int is_sq, uint64_t value)
1876 {
1877         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
1878                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
1879
1880         if (is_sq) {
1881                 if (idx > sc->num_squeues) {
1882                         WPRINTF("%s queue index %lu overflow from "
1883                                  "guest (max %u)",
1884                                  __func__, idx, sc->num_squeues);
1885                         return;
1886                 }
1887
1888                 atomic_store_short(&sc->submit_queues[idx].tail,
1889                                    (uint16_t)value);
1890
1891                 if (idx == 0) {
1892                         pci_nvme_handle_admin_cmd(sc, value);
1893                 } else {
1894                         /* submission queue; handle new entries in SQ */
1895                         if (idx > sc->num_squeues) {
1896                                 WPRINTF("%s SQ index %lu overflow from "
1897                                          "guest (max %u)",
1898                                          __func__, idx, sc->num_squeues);
1899                                 return;
1900                         }
1901                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1902                 }
1903         } else {
1904                 if (idx > sc->num_cqueues) {
1905                         WPRINTF("%s queue index %lu overflow from "
1906                                  "guest (max %u)",
1907                                  __func__, idx, sc->num_cqueues);
1908                         return;
1909                 }
1910
1911                 atomic_store_short(&sc->compl_queues[idx].head,
1912                                 (uint16_t)value);
1913         }
1914 }
1915
1916 static void
1917 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1918 {
1919         const char *s = iswrite ? "WRITE" : "READ";
1920
1921         switch (offset) {
1922         case NVME_CR_CAP_LOW:
1923                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
1924                 break;
1925         case NVME_CR_CAP_HI:
1926                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
1927                 break;
1928         case NVME_CR_VS:
1929                 DPRINTF("%s %s NVME_CR_VS", func, s);
1930                 break;
1931         case NVME_CR_INTMS:
1932                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
1933                 break;
1934         case NVME_CR_INTMC:
1935                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
1936                 break;
1937         case NVME_CR_CC:
1938                 DPRINTF("%s %s NVME_CR_CC", func, s);
1939                 break;
1940         case NVME_CR_CSTS:
1941                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
1942                 break;
1943         case NVME_CR_NSSR:
1944                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
1945                 break;
1946         case NVME_CR_AQA:
1947                 DPRINTF("%s %s NVME_CR_AQA", func, s);
1948                 break;
1949         case NVME_CR_ASQ_LOW:
1950                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
1951                 break;
1952         case NVME_CR_ASQ_HI:
1953                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
1954                 break;
1955         case NVME_CR_ACQ_LOW:
1956                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
1957                 break;
1958         case NVME_CR_ACQ_HI:
1959                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
1960                 break;
1961         default:
1962                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
1963         }
1964
1965 }
1966
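     /*
      * BAR0 register writes.  Doorbell writes (offset >= NVME_DOORBELL_OFFSET)
      * are decoded and handed to pci_nvme_handle_doorbell(); all other
      * registers must be written as 4-byte accesses.  Writes to CC drive the
      * controller state: toggling EN enables (initializes) or resets the
      * controller, and a non-zero SHN immediately reports shutdown complete
      * in CSTS.
      */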
1967 static void
1968 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1969         uint64_t offset, int size, uint64_t value)
1970 {
1971         uint32_t ccreg;
1972
1973         if (offset >= NVME_DOORBELL_OFFSET) {
1974                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1975                 uint64_t idx = belloffset / 8; /* doorbell pair: 4-byte SQ tail + 4-byte CQ head */
1976                 int is_sq = (belloffset % 8) < 4;
1977
1978                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1979                         WPRINTF("guest attempted an overflow write offset "
1980                                  "0x%lx, val 0x%lx in %s",
1981                                  offset, value, __func__);
1982                         return;
1983                 }
1984
1985                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1986                 return;
1987         }
1988
1989         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
1990                 offset, size, value);
1991
1992         if (size != 4) {
1993                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
1994                          "val 0x%lx) to bar0 in %s",
1995                          size, offset, value, __func__);
1996                 /* TODO: shutdown device */
1997                 return;
1998         }
1999
2000         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2001
2002         pthread_mutex_lock(&sc->mtx);
2003
2004         switch (offset) {
2005         case NVME_CR_CAP_LOW:
2006         case NVME_CR_CAP_HI:
2007                 /* readonly */
2008                 break;
2009         case NVME_CR_VS:
2010                 /* readonly */
2011                 break;
2012         case NVME_CR_INTMS:
2013                 /* MSI-X, so ignore */
2014                 break;
2015         case NVME_CR_INTMC:
2016                 /* MSI-X, so ignore */
2017                 break;
2018         case NVME_CR_CC:
2019                 ccreg = (uint32_t)value;
2020
2021                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2022                          "iocqes %u",
2023                         __func__,
2024                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2025                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2026                          NVME_CC_GET_IOCQES(ccreg));
2027
2028                 if (NVME_CC_GET_SHN(ccreg)) {
2029                         /* perform shutdown - flush out data to backend */
2030                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2031                             NVME_CSTS_REG_SHST_SHIFT);
2032                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2033                             NVME_CSTS_REG_SHST_SHIFT;
2034                 }
2035                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2036                         if (NVME_CC_GET_EN(ccreg) == 0)
2037                                 /* transition 1->0 causes controller reset */
2038                                 pci_nvme_reset_locked(sc);
2039                         else
2040                                 pci_nvme_init_controller(ctx, sc);
2041                 }
2042
2043                 /* Insert the iocqes, iosqes and en bits from the write */
2044                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2045                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2046                 if (NVME_CC_GET_EN(ccreg) == 0) {
2047                         /* Insert the ams, mps and css bit fields */
2048                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2049                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2050                         sc->regs.csts &= ~NVME_CSTS_RDY;
2051                 } else if (sc->pending_ios == 0) {
2052                         sc->regs.csts |= NVME_CSTS_RDY;
2053                 }
2054                 break;
2055         case NVME_CR_CSTS:
2056                 break;
2057         case NVME_CR_NSSR:
2058                 /* ignore writes; don't support subsystem reset */
2059                 break;
2060         case NVME_CR_AQA:
2061                 sc->regs.aqa = (uint32_t)value;
2062                 break;
2063         case NVME_CR_ASQ_LOW:
2064                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2065                                (0xFFFFF000 & value);
2066                 break;
2067         case NVME_CR_ASQ_HI:
2068                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2069                                (value << 32);
2070                 break;
2071         case NVME_CR_ACQ_LOW:
2072                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2073                                (0xFFFFF000 & value);
2074                 break;
2075         case NVME_CR_ACQ_HI:
2076                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2077                                (value << 32);
2078                 break;
2079         default:
2080                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2081                          __func__, offset, value, size);
2082         }
2083         pthread_mutex_unlock(&sc->mtx);
2084 }
2085
2086 static void
2087 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2088                 int baridx, uint64_t offset, int size, uint64_t value)
2089 {
2090         struct pci_nvme_softc* sc = pi->pi_arg;
2091
2092         if (baridx == pci_msix_table_bar(pi) ||
2093             baridx == pci_msix_pba_bar(pi)) {
2094                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2095                          " value 0x%lx", baridx, offset, size, value);
2096
2097                 pci_emul_msix_twrite(pi, offset, size, value);
2098                 return;
2099         }
2100
2101         switch (baridx) {
2102         case 0:
2103                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2104                 break;
2105
2106         default:
2107                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2108                          __func__, baridx, value);
2109         }
2110 }
2111
2112 static uint64_t
2113 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
2114 {
2115         uint64_t value;
2116
2117         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2118
2119         if (offset < NVME_DOORBELL_OFFSET) {
2120                 void *p = &(sc->regs);
2121                 pthread_mutex_lock(&sc->mtx);
2122                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2123                 pthread_mutex_unlock(&sc->mtx);
2124         } else {
2125                 value = 0;
2126                 WPRINTF("pci_nvme: read invalid offset %lu", offset);
2127         }
2128
2129         switch (size) {
2130         case 1:
2131                 value &= 0xFF;
2132                 break;
2133         case 2:
2134                 value &= 0xFFFF;
2135                 break;
2136         case 4:
2137                 value &= 0xFFFFFFFF;
2138                 break;
2139         }
2140
2141         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2142                  offset, size, (uint32_t)value);
2143
2144         return (value);
2145 }
2146
2147
2148
2149 static uint64_t
2150 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2151     uint64_t offset, int size)
2152 {
2153         struct pci_nvme_softc* sc = pi->pi_arg;
2154
2155         if (baridx == pci_msix_table_bar(pi) ||
2156             baridx == pci_msix_pba_bar(pi)) {
2157                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2158                         baridx, offset, size);
2159
2160                 return pci_emul_msix_tread(pi, offset, size);
2161         }
2162
2163         switch (baridx) {
2164         case 0:
2165                 return pci_nvme_read_bar_0(sc, offset, size);
2166
2167         default:
2168                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2169         }
2170
2171         return (0);
2172 }
2173
2174
2175 static int
2176 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2177 {
2178         char bident[sizeof("XX:X:X")];
2179         char    *uopt, *xopts, *config;
2180         uint32_t sectsz;
2181         int optidx;
2182
2183         sc->max_queues = NVME_QUEUES;
2184         sc->max_qentries = NVME_MAX_QENTRIES;
2185         sc->ioslots = NVME_IOSLOTS;
2186         sc->num_squeues = sc->max_queues;
2187         sc->num_cqueues = sc->max_queues;
2188         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2189         sectsz = 0;
2190
2191         uopt = strdup(opts);
2192         optidx = 0;
2193         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2194                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2195         for (xopts = strtok(uopt, ",");
2196              xopts != NULL;
2197              xopts = strtok(NULL, ",")) {
2198
2199                 if ((config = strchr(xopts, '=')) != NULL)
2200                         *config++ = '\0';
2201
2202                 if (!strcmp("maxq", xopts)) {
2203                         sc->max_queues = atoi(config);
2204                 } else if (!strcmp("qsz", xopts)) {
2205                         sc->max_qentries = atoi(config);
2206                 } else if (!strcmp("ioslots", xopts)) {
2207                         sc->ioslots = atoi(config);
2208                 } else if (!strcmp("sectsz", xopts)) {
2209                         sectsz = atoi(config);
2210                 } else if (!strcmp("ser", xopts)) {
2211                         /*
2212                          * This field indicates the Product Serial Number in
2213                          * 7-bit ASCII; unused bytes should be space characters.
2214                          * Ref: NVMe v1.3c.
2215                          */
2216                         cpywithpad((char *)sc->ctrldata.sn,
2217                                    sizeof(sc->ctrldata.sn), config, ' ');
2218                 } else if (!strcmp("ram", xopts)) {
2219                         uint64_t sz = strtoull(config, NULL, 10);
2220
2221                         sc->nvstore.type = NVME_STOR_RAM;
2222                         sc->nvstore.size = sz * 1024 * 1024;
2223                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2224                         sc->nvstore.sectsz = 4096;
2225                         sc->nvstore.sectsz_bits = 12;
2226                         if (sc->nvstore.ctx == NULL) {
2227                                 perror("Unable to allocate RAM");
2228                                 free(uopt);
2229                                 return (-1);
2230                         }
2231                 } else if (!strcmp("eui64", xopts)) {
2232                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2233                 } else if (!strcmp("dsm", xopts)) {
2234                         if (!strcmp("auto", config))
2235                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2236                         else if (!strcmp("enable", config))
2237                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2238                         else if (!strcmp("disable", config))
2239                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2240                 } else if (optidx == 0) {
2241                         snprintf(bident, sizeof(bident), "%d:%d",
2242                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2243                         sc->nvstore.ctx = blockif_open(xopts, bident);
2244                         if (sc->nvstore.ctx == NULL) {
2245                                 perror("Could not open backing file");
2246                                 free(uopt);
2247                                 return (-1);
2248                         }
2249                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2250                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2251                 } else {
2252                         EPRINTLN("Invalid option %s", xopts);
2253                         free(uopt);
2254                         return (-1);
2255                 }
2256
2257                 optidx++;
2258         }
2259         free(uopt);
2260
2261         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2262                 EPRINTLN("backing store not specified");
2263                 return (-1);
2264         }
2265         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2266                 sc->nvstore.sectsz = sectsz;
2267         else if (sc->nvstore.type != NVME_STOR_RAM)
2268                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2269         for (sc->nvstore.sectsz_bits = 9;
2270              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2271              sc->nvstore.sectsz_bits++);
2272
2273         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2274                 sc->max_queues = NVME_QUEUES;
2275
2276         if (sc->max_qentries <= 0) {
2277                 EPRINTLN("Invalid qsz option");
2278                 return (-1);
2279         }
2280         if (sc->ioslots <= 0) {
2281                 EPRINTLN("Invalid ioslots option");
2282                 return (-1);
2283         }
2284
2285         return (0);
2286 }
2287
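     /*
      * Device instantiation: parse the slot options, build the free list of
      * ioslots I/O requests, program the PCI config space IDs, and size BAR0
      * to hold the register file plus one SQ/CQ doorbell pair per queue
      * (rounded up to the 16 KiB minimum).  MSI-X is sized with one vector
      * per I/O completion queue plus the admin queue.
      */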
2288 static int
2289 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2290 {
2291         struct pci_nvme_softc *sc;
2292         uint32_t pci_membar_sz;
2293         int     error;
2294
2295         error = 0;
2296
2297         sc = calloc(1, sizeof(struct pci_nvme_softc));
2298         pi->pi_arg = sc;
2299         sc->nsc_pi = pi;
2300
2301         error = pci_nvme_parse_opts(sc, opts);
2302         if (error < 0)
2303                 goto done;
2304         else
2305                 error = 0;
2306
2307         STAILQ_INIT(&sc->ioreqs_free);
2308         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2309         for (int i = 0; i < sc->ioslots; i++) {
2310                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2311                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
2312                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
2313         }
2314         sc->intr_coales_aggr_thresh = 1;
2315
2316         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2317         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2318         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2319         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2320         pci_set_cfgdata8(pi, PCIR_PROGIF,
2321                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2322
2323         /*
2324          * Allocate size of NVMe registers + doorbell space for all queues.
2325          *
2326          * The specification requires a minimum memory I/O window size of 16K.
2327          * The Windows driver will refuse to start a device with a smaller
2328          * window.
2329          */
2330         pci_membar_sz = sizeof(struct nvme_registers) +
2331             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2332         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2333
2334         DPRINTF("nvme membar size: %u", pci_membar_sz);
2335
2336         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2337         if (error) {
2338                 WPRINTF("%s pci alloc mem bar failed", __func__);
2339                 goto done;
2340         }
2341
2342         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2343         if (error) {
2344                 WPRINTF("%s pci add msixcap failed", __func__);
2345                 goto done;
2346         }
2347
2348         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2349         if (error) {
2350                 WPRINTF("%s pci add Express capability failed", __func__);
2351                 goto done;
2352         }
2353
2354         pthread_mutex_init(&sc->mtx, NULL);
2355         sem_init(&sc->iosemlock, 0, sc->ioslots);
2356
2357         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2358         /*
2359          * Controller data depends on Namespace data so initialize Namespace
2360          * data first.
2361          */
2362         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2363         pci_nvme_init_ctrldata(sc);
2364         pci_nvme_init_logpages(sc);
2365
2366         pci_nvme_reset(sc);
2367
2368         pci_lintr_request(pi);
2369
2370 done:
2371         return (error);
2372 }
2373
2374
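     /*
      * PCI_EMUL_SET() places this emulation in the linker set that bhyve
      * scans when matching the device name given in an "-s <slot>,nvme,..."
      * option, so pci_nvme_init() runs when an nvme device is configured.
      */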
2375 struct pci_devemu pci_de_nvme = {
2376         .pe_emu =       "nvme",
2377         .pe_init =      pci_nvme_init,
2378         .pe_barwrite =  pci_nvme_write,
2379         .pe_barread =   pci_nvme_read
2380 };
2381 PCI_EMUL_SET(pci_de_nvme);