/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = Dataset Management support: one of auto, enable, disable
 */

/* TODO:
 *  - create async event for smart and log
 *  - intr coalesce
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"
static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) do {					\
	if (nvme_debug)							\
		PRINTLN(fmt, ##args);					\
} while (0)
#define	WPRINTF(fmt, args...)	PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MAX_BLOCKIOVS	512

/* Max DSM trim payload: 256 ranges of 16 bytes each */
#define	NVME_MAX_DSM_TRIM	4096

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
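/*
 * For example, with num_squeues = 16 and num_cqueues = 16, both counts are
 * reported zero-based, so the encoding above yields 0x000f000f: NSQA in the
 * lower 16 bits and NCQA in the upper 16 bits of completion dword 0.
 */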
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
	pthread_mutex_t	mtx;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		busy; /* queue is being processed */
	int		qpriority;
};
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;

	/*
	 * lock if all iovs consumed (big IO);
	 * complete transaction before continuing
	 */
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;

	struct blockif_req io_req;

	/* pad to fit up to 512 page descriptors from guest IO request */
	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};
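/*
 * Note on the layout above: iovpadding extends the br_iov[] array embedded
 * at the tail of io_req, letting one request describe up to
 * NVME_MAX_BLOCKIOVS guest pages even though blockif itself only declares
 * BLOCKIF_IOV_MAX entries. This relies on iovpadding sitting immediately
 * after io_req in memory.
 */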
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	/* controller features */
	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
	uint32_t	async_ev_config;         /* 0x0B: async event config */

	enum nvme_dsm_type dataset_management;
};
static void pci_nvme_io_partial(struct blockif_req *br, int err);
/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}
static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
static void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

	if (prev)
		*status &= ~NVME_STATUS_P;
	else
		*status |= NVME_STATUS_P;
}
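/*
 * Completion entries carry a Phase Tag (P) bit that the controller inverts
 * on each pass through the completion queue; the guest compares it against
 * its expected phase to detect newly posted completions without having to
 * read a device register.
 */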
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;	/* NVMe v1.3 */

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
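	/*
	 * SQES/CQES above are log2 values: 6 -> 64-byte submission queue
	 * entries and 4 -> 16-byte completion queue entries, reported as
	 * both the maximum and minimum supported sizes.
	 */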
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);
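	/*
	 * The generated EUI-64 thus places the FreeBSD OUI and a CRC-16 of
	 * the VM name and PCI address in the upper bytes, with the low 16
	 * bits carrying the NSID so each namespace remains unique.
	 */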
	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
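	/*
	 * CAP reports: MQES = max entries per queue (zero-based), CQR = 1
	 * (queues must be physically contiguous), and TO = 60, a worst-case
	 * ready timeout of 60 * 500 msec = 30 seconds.
	 */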
	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	sc->num_cqueues = sc->num_squeues = sc->max_queues;
	if (sc->submit_queues != NULL) {
		for (int i = 0; i < sc->num_squeues + 1; i++) {
			/*
			 * The Admin Submission Queue is at index 0.
			 * It must not be changed at reset otherwise the
			 * emulation will be out of sync with the guest.
			 */
			if (i != 0) {
				sc->submit_queues[i].qbase = NULL;
				sc->submit_queues[i].size = 0;
				sc->submit_queues[i].cqid = 0;
			}
			sc->submit_queues[i].tail = 0;
			sc->submit_queues[i].head = 0;
			sc->submit_queues[i].busy = 0;
		}
	} else
		sc->submit_queues = calloc(sc->num_squeues + 1,
		    sizeof(struct nvme_submission_queue));

	if (sc->compl_queues != NULL) {
		for (int i = 0; i < sc->num_cqueues + 1; i++) {
			/* See Admin Submission Queue note above */
			if (i != 0) {
				sc->compl_queues[i].qbase = NULL;
				sc->compl_queues[i].size = 0;
			}

			sc->compl_queues[i].tail = 0;
			sc->compl_queues[i].head = 0;
		}
	} else {
		sc->compl_queues = calloc(sc->num_cqueues + 1,
		    sizeof(struct nvme_completion_queue));

		for (int i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
	}
}
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
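/*
 * nvme_prp_memcpy copies between a host buffer and a guest region named by
 * PRP1/PRP2 only (it does not walk PRP lists), so it is limited to two
 * pages (8 KiB). The admin payloads copied through it (identify data, log
 * pages, DSM ranges) all fit within that bound.
 */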
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;
	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);
		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}
static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if ((qid == 0) || (qid > sc->num_cqueues)) {
			WPRINTF("%s queue index %u > num_cqueues %u",
			    __func__, qid, sc->num_cqueues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		ncq = &sc->compl_queues[qid];
		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		/* Completion queues hold nvme_completion entries */
		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
		    command->prp1,
		    sizeof(struct nvme_completion) * (size_t)ncq->size);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		/*
		 * Non-contig completion queue unsupported.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		/* 0x12 = Invalid Use of Controller Memory Buffer */
		pci_nvme_status_genc(&compl->status, 0x12);
	}

	return (1);
}
static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	/* NUMD is a zero-based count of dwords (4-byte units) */
	logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		/* TODO: present some smart info */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		WPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		((uint32_t *)dest)[1] = 0;
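		/*
		 * This controller exposes a single namespace, so the
		 * 1024-entry active NSID list is "1" followed by a zero
		 * terminator.
		 */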
		break;
	case 0x11:
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
		return (1);
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
	case 0x10:
	case 0x12:
	case 0x13:
	case 0x14:
	case 0x15:
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (-1);
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (-1);
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	return (0);
}
static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;
	uint32_t iv;

	DPRINTF("%s feature 0x%x", __func__, feature);

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF(" arbitration 0x%x", command->cdw11);
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF(" power management 0x%x", command->cdw11);
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF(" lba range 0x%x", command->cdw11);
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF(" temperature threshold 0x%x", command->cdw11);
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF(" error recovery 0x%x", command->cdw11);
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF(" volatile write cache 0x%x", command->cdw11);
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		nvme_set_feature_queues(sc, command, compl);
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF(" interrupt coalescing 0x%x", command->cdw11);

		/* in uS */
		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		iv = command->cdw11 & 0xFFFF;

		DPRINTF(" interrupt vector configuration 0x%x",
		    command->cdw11);

		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
			if (sc->compl_queues[i].intr_vec == iv) {
				if (command->cdw11 & (1 << 16))
					sc->compl_queues[i].intr_en |=
					    NVME_CQ_INTCOAL;
				else
					sc->compl_queues[i].intr_en &=
					    ~NVME_CQ_INTCOAL;
			}
		}
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF(" write atomicity 0x%x", command->cdw11);
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF(" async event configuration 0x%x",
		    command->cdw11);
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF(" software progress marker 0x%x",
		    command->cdw11);
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF(" autonomous power state transition 0x%x",
		    command->cdw11);
		break;
	default:
		WPRINTF("%s invalid feature", __func__);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;

	DPRINTF("%s feature 0x%x", __func__, feature);

	compl->cdw0 = 0;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF(" arbitration");
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF(" power management");
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF(" lba range");
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF(" temperature threshold");
		switch ((command->cdw11 >> 20) & 0x3) {
		case 0:
			/* Over temp threshold */
			compl->cdw0 = 0xFFFF;
			break;
		case 1:
			/* Under temp threshold */
			compl->cdw0 = 0;
			break;
		default:
			WPRINTF(" invalid threshold type select");
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			return (1);
		}
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF(" error recovery");
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF(" volatile write cache");
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

		DPRINTF(" number of queues (submit %u, completion %u)",
		    compl->cdw0 & 0xFFFF,
		    (compl->cdw0 >> 16) & 0xFFFF);
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF(" interrupt coalescing");
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		DPRINTF(" interrupt vector configuration");
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF(" write atomicity");
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF(" async event configuration");
		/* Get returns the current setting rather than modifying it */
		compl->cdw0 = sc->async_ev_config;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF(" software progress marker");
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF(" autonomous power state transition");
		break;
	default:
		WPRINTF("%s invalid feature 0x%x", __func__, feature);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	/* Dword 0 bit 0 set to 1 indicates the command was not aborted */
	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen asynchronously, so only set the completion as
	 * successful if there is an event matching the request.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
	return (1);
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	sqhead = atomic_load_acq_short(&sq->head);

	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF("%s SQ busy, head %u, tail %u",
		    __func__, sqhead, sq->tail);
		return;
	}

	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			/* XXX dont care, unhandled for now
			nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			compl.status = NVME_NO_STATUS;
			break;
		default:
			WPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			struct nvme_completion *cp;
			int phase;

			cp = &(cq->qbase)[cq->tail];
			cp->cdw0 = compl.cdw0;
			cp->sqid = 0;
			cp->sqhd = sqhead;
			cp->cid = cmd->cid;

			phase = NVME_STATUS_GET_P(cp->status);
			cp->status = compl.status;
			pci_nvme_toggle_phase(&cp->status, phase);

			cq->tail = (cq->tail + 1) % cq->size;
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);
}
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
		/* concatenate contig block-iovs to minimize number of iovs */
		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
			iovidx = req->io_req.br_iovcnt - 1;

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			        req->prev_gpaddr, size);

			req->prev_size += size;
			req->io_req.br_resid += size;

			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
		} else {
			pthread_mutex_lock(&req->mtx);

			iovidx = req->io_req.br_iovcnt;
			if (iovidx == NVME_MAX_BLOCKIOVS) {
				int err = 0;

				DPRINTF("large I/O, doing partial req");

				iovidx = 0;
				req->io_req.br_iovcnt = 0;

				req->io_req.br_callback = pci_nvme_io_partial;

				if (!do_write)
					err = blockif_read(sc->nvstore.ctx,
					    &req->io_req);
				else
					err = blockif_write(sc->nvstore.ctx,
					    &req->io_req);

				/* wait until req completes before cont */
				if (err == 0)
					pthread_cond_wait(&req->cv, &req->mtx);

				req->io_req.br_offset = lba;
				req->io_req.br_resid = 0;
				req->io_req.br_param = req;
			}

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			        gpaddr, size);

			req->io_req.br_iov[iovidx].iov_len = size;

			req->prev_gpaddr = gpaddr;
			req->prev_size = size;
			req->io_req.br_resid += size;

			req->io_req.br_iovcnt++;

			pthread_mutex_unlock(&req->mtx);
		}
	} else {
		/* RAM buffer: read/write directly */
		void *p = sc->nvstore.ctx;
		void *gptr;

		if ((lba + size) > sc->nvstore.size) {
			WPRINTF("%s write would overflow RAM", __func__);
			return (-1);
		}

		p = (void *)((uintptr_t)p + (uintptr_t)lba);
		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
		if (do_write)
			memcpy(p, gptr, size);
		else
			memcpy(gptr, p, size);
	}
	return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status, int ignore_busy)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
	struct nvme_completion *compl;
	int phase;

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pthread_mutex_lock(&cq->mtx);

	assert(cq->qbase != NULL);

	compl = &cq->qbase[cq->tail];

	compl->cdw0 = cdw0;
	compl->sqid = sqid;
	compl->sqhd = atomic_load_acq_short(&sq->head);
	compl->cid = cid;

	phase = NVME_STATUS_GET_P(compl->status);
	compl->status = status;
	pci_nvme_toggle_phase(&compl->status, phase);

	cq->tail = (cq->tail + 1) % cq->size;

	pthread_mutex_unlock(&cq->mtx);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status = 0;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
	pci_nvme_release_ioreq(req->sc, req);
}
static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	pthread_cond_signal(&req->cv);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;

	offset = lba * nvstore->sectsz;
	bytes  = nblocks * nvstore->sectsz;

	if ((offset + bytes) > nvstore->size) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	/* blockif br_offset is a byte offset, not an LBA */
	req->io_req.br_offset = offset;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		uint8_t *buf = nvstore->ctx;
		enum nvme_copy_dir dir;

		if (is_write)
			dir = NVME_COPY_TO_PRP;
		else
			dir = NVME_COPY_FROM_PRP;

		if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    buf + offset, bytes, dir))
			pci_nvme_status_genc(status,
			    NVME_SC_DATA_TRANSFER_ERROR);
		else
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		uint64_t size;
		int err;

		size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
		if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
		    size, is_write, offset)) {
			pci_nvme_status_genc(status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}

		offset += size;
		bytes  -= size;

		if (bytes == 0) {
			;
		} else if (bytes <= PAGE_SIZE) {
			size = bytes;
			if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
			    size, is_write, offset)) {
				pci_nvme_status_genc(status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}
		} else {
			void *vmctx = sc->nsc_pi->pi_vmctx;
			uint64_t *prp_list = &cmd->prp2;
			uint64_t *last = prp_list;

			/* PRP2 is pointer to a physical region page list */
			while (bytes) {
				/* Last entry in list points to the next list */
				if (prp_list == last) {
					uint64_t prp = *prp_list;

					prp_list = paddr_guest2host(vmctx, prp,
					    PAGE_SIZE - (prp % PAGE_SIZE));
					last = prp_list + (NVME_PRP2_ITEMS - 1);
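					/*
					 * With 4 KiB pages a PRP list holds
					 * NVME_PRP2_ITEMS (512) 8-byte
					 * entries; the final slot of each
					 * list page is consumed here as the
					 * pointer chaining to the next list.
					 */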
				}

				size = MIN(bytes, PAGE_SIZE);

				if (pci_nvme_append_iov_req(sc, req, *prp_list,
				    size, is_write, offset)) {
					pci_nvme_status_genc(status,
					    NVME_SC_DATA_TRANSFER_ERROR);
					goto out;
				}

				offset += size;
				bytes  -= size;

				prp_list++;
			}
		}
		req->io_req.br_callback = pci_nvme_io_done;
		if (is_write)
			err = blockif_write(nvstore->ctx, &req->io_req);
		else
			err = blockif_read(nvstore->ctx, &req->io_req);

		if (err)
			pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
		else
			pending = true;
	}
out:
	return (pending);
}
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status = 0;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status, 0);
		pci_nvme_release_ioreq(sc, req);
	}
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		struct nvme_dsm_range *range;
		uint32_t nr, r;
		int sectsz = sc->nvstore.sectsz;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		/* copy locally because a range entry could straddle PRPs */
		range = calloc(1, NVME_MAX_DSM_TRIM);
		if (range == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		nr = cmd->cdw10 & 0xff;

		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = range[0].starting_lba * sectsz;
		req->io_req.br_resid = range[0].length * sectsz;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0; r <= nr; r++) {
				iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
				iov[r].iov_len = range[r].length * sectsz;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = r;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;

		free(range);
	}
out:
	return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF("%s sqid %u busy", __func__, idx);
		return;
	}

	sqhead = atomic_load_acq_short(&sq->head);

	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 0);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		sc->compl_queues[idx].head = (uint16_t)value;
	}
}
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* doorbell size = 2 x 32-bit regs */
		int is_sq = (belloffset % 8) < 4;
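		/*
		 * With CAP.DSTRD = 0 the doorbell stride is 4 bytes, so each
		 * queue pair owns 8 bytes of doorbell space: 0x1000 is the
		 * SQ0 tail, 0x1004 the CQ0 head, 0x1008 the SQ1 tail, etc.
		 */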
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
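		/*
		 * Admin queue base addresses must be 4 KiB aligned; the
		 * 0xFFFFF000 mask forces the low 12 bits to zero.
		 */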
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}

	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
	int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
	uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII; unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(config, NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);
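	/*
	 * The loop above computes sectsz_bits = log2(sectsz), e.g.
	 * 512 -> 9, 4096 -> 12, 8192 -> 13.
	 */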
	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
	}
	sc->intr_coales_aggr_thresh = 1;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
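	/*
	 * For example, the default 16 queue pairs need roughly 4 KiB of
	 * register file plus 17 * 8 bytes of doorbells, so the 16 KiB
	 * NVME_MMIO_SPACE_MIN floor is what actually sizes the BAR here.
	 */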
2214 DPRINTF("nvme membar size: %u", pci_membar_sz);
2216 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2218 WPRINTF("%s pci alloc mem bar failed", __func__);
2222 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2224 WPRINTF("%s pci add msixcap failed", __func__);
2228 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2230 WPRINTF("%s pci add Express capability failed", __func__);
2234 pthread_mutex_init(&sc->mtx, NULL);
2235 sem_init(&sc->iosemlock, 0, sc->ioslots);
2239 * Controller data depends on Namespace data so initialize Namespace
2242 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2243 pci_nvme_init_ctrldata(sc);
2244 pci_nvme_init_logpages(sc);
2246 pci_lintr_request(pi);
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);